% ICON 2024 conference paper; record sourced from the ACL Anthology (see the url field).
@inproceedings{ranjan-etal-2024-systematic,
  title     = {A Systematic Exploration of Linguistic Phenomena in Spoken {Hindi}: Resource Creation and Hypothesis Testing},
  author    = {Ranjan, Aadya and
               Ranjan, Sidharth and
               Rajkumar, Rajakrishnan},
  editor    = {Lalitha Devi, Sobha and
               Arora, Karunesh},
  booktitle = {Proceedings of the 21st International Conference on Natural Language Processing ({ICON})},
  month     = dec,
  year      = {2024},
  address   = {AU-KBC Research Centre, Chennai, India},
  publisher = {NLP Association of India (NLPAI)},
  url       = {https://aclanthology.org/2024.icon-1.8/},
  pages     = {68--78},
  abstract  = {This paper presents a meticulous and well-structured approach to annotating a corpus of Hindi spoken data. We deployed 4 annotators to augment the spoken section of the EMILLE Hindi corpus by marking the various linguistic phenomena observed in spoken data. Then we analyzed various phonological (sound deletion), morphological (code-mixing and reduplication) and syntactic phenomena (case markers and ambiguity), not attested in written data. Code mixing and switching constitute the majority of the phenomena we annotated, followed by orthographic errors related to symbols in the Devanagari script. In terms of divergences from written form of Hindi, case marker usage, missing auxiliary verbs and agreement patterns are markedly distinct for spoken Hindi. The annotators also assigned a quality rating to each sentence in the corpus. Our analysis of the quality ratings revealed that most of the sentences in the spoken data corpus are of moderate to high quality. Female speakers produced a greater percentage of high quality sentences compared to their male counterparts. While previous efforts in corpus annotation have been largely focused on creating resources for engineering applications, we illustrate the utility of our dataset for scientific hypothesis testing. Inspired from the Surprisal Theory of language comprehension, we validate the hypothesis that sentences with high values of lexical surprisal are rated low in terms of quality by native speakers, even when controlling for sentence length and word frequencies in a sentence.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ranjan-etal-2024-systematic">
<titleInfo>
<title>A Systematic Exploration of Linguistic Phenomena in Spoken Hindi: Resource Creation and Hypothesis Testing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aadya</namePart>
<namePart type="family">Ranjan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sidharth</namePart>
<namePart type="family">Ranjan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rajakrishnan</namePart>
<namePart type="family">Rajkumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sobha</namePart>
<namePart type="family">Lalitha Devi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karunesh</namePart>
<namePart type="family">Arora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">AU-KBC Research Centre, Chennai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents a meticulous and well-structured approach to annotating a corpus of Hindi spoken data. We deployed 4 annotators to augment the spoken section of the EMILLE Hindi corpus by marking the various linguistic phenomena observed in spoken data. Then we analyzed various phonological (sound deletion), morphological (code-mixing and reduplication) and syntactic phenomena (case markers and ambiguity), not attested in written data. Code mixing and switching constitute the majority of the phenomena we annotated, followed by orthographic errors related to symbols in the Devanagari script. In terms of divergences from written form of Hindi, case marker usage, missing auxiliary verbs and agreement patterns are markedly distinct for spoken Hindi. The annotators also assigned a quality rating to each sentence in the corpus. Our analysis of the quality ratings revealed that most of the sentences in the spoken data corpus are of moderate to high quality. Female speakers produced a greater percentage of high quality sentences compared to their male counterparts. While previous efforts in corpus annotation have been largely focused on creating resources for engineering applications, we illustrate the utility of our dataset for scientific hypothesis testing. Inspired from the Surprisal Theory of language comprehension, we validate the hypothesis that sentences with high values of lexical surprisal are rated low in terms of quality by native speakers, even when controlling for sentence length and word frequencies in a sentence.</abstract>
<identifier type="citekey">ranjan-etal-2024-systematic</identifier>
<location>
<url>https://aclanthology.org/2024.icon-1.8/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>68</start>
<end>78</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Systematic Exploration of Linguistic Phenomena in Spoken Hindi: Resource Creation and Hypothesis Testing
%A Ranjan, Aadya
%A Ranjan, Sidharth
%A Rajkumar, Rajakrishnan
%Y Lalitha Devi, Sobha
%Y Arora, Karunesh
%S Proceedings of the 21st International Conference on Natural Language Processing (ICON)
%D 2024
%8 December
%I NLP Association of India (NLPAI)
%C AU-KBC Research Centre, Chennai, India
%F ranjan-etal-2024-systematic
%X This paper presents a meticulous and well-structured approach to annotating a corpus of Hindi spoken data. We deployed 4 annotators to augment the spoken section of the EMILLE Hindi corpus by marking the various linguistic phenomena observed in spoken data. Then we analyzed various phonological (sound deletion), morphological (code-mixing and reduplication) and syntactic phenomena (case markers and ambiguity), not attested in written data. Code mixing and switching constitute the majority of the phenomena we annotated, followed by orthographic errors related to symbols in the Devanagari script. In terms of divergences from written form of Hindi, case marker usage, missing auxiliary verbs and agreement patterns are markedly distinct for spoken Hindi. The annotators also assigned a quality rating to each sentence in the corpus. Our analysis of the quality ratings revealed that most of the sentences in the spoken data corpus are of moderate to high quality. Female speakers produced a greater percentage of high quality sentences compared to their male counterparts. While previous efforts in corpus annotation have been largely focused on creating resources for engineering applications, we illustrate the utility of our dataset for scientific hypothesis testing. Inspired from the Surprisal Theory of language comprehension, we validate the hypothesis that sentences with high values of lexical surprisal are rated low in terms of quality by native speakers, even when controlling for sentence length and word frequencies in a sentence.
%U https://aclanthology.org/2024.icon-1.8/
%P 68-78
Markdown (Informal)
[A Systematic Exploration of Linguistic Phenomena in Spoken Hindi: Resource Creation and Hypothesis Testing](https://aclanthology.org/2024.icon-1.8/) (Ranjan et al., ICON 2024)
ACL