@article{kirov-etal-2024-context,
title = "Context-aware Transliteration of {R}omanized {S}outh {A}sian Languages",
author = "Kirov, Christo and
Johny, Cibu and
Katanova, Anna and
Gutkin, Alexander and
Roark, Brian",
journal = "Computational Linguistics",
volume = "50",
number = "2",
month = jun,
year = "2024",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2024.cl-2.2/",
doi = "10.1162/coli_a_00510",
pages = "475--534",
abstract = "While most transliteration research is focused on single tokens such as named entities{---}for example, transliteration from the Gujarati script to the Latin script as {\textquotedblleft}Ahmedabad{\textquotedblright} (the most populous city in the Indian state of Gujarat){---}the informal romanization prevalent in South Asia and elsewhere often requires transliteration of full sentences. The lack of large parallel text collections of full sentence (as opposed to single word) transliterations necessitates incorporation of contextual information into transliteration via non-parallel resources, such as via mono-script text collections. In this article, we present a number of methods for improving transliteration in context for such a use scenario. Some of these methods in fact improve performance without making use of sentential context, allowing for better quantification of the degree to which contextual information in particular is responsible for system improvements. Our final systems, which ultimately rely upon ensembles including large pretrained language models fine-tuned on simulated parallel data, yield substantial improvements over the best previously reported results for full sentence transliteration from Latin to native script on all 12 languages in the Dakshina dataset (Roark et al. 2020), with an overall 3.3{\%} absolute (18.6{\%} relative) mean word-error rate reduction."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kirov-etal-2024-context">
<titleInfo>
<title>Context-aware Transliteration of Romanized South Asian Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christo</namePart>
<namePart type="family">Kirov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cibu</namePart>
<namePart type="family">Johny</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Katanova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Gutkin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Roark</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>While most transliteration research is focused on single tokens such as named entities—for example, transliteration from the Gujarati script to the Latin script as “Ahmedabad” (the most populous city in the Indian state of Gujarat)—the informal romanization prevalent in South Asia and elsewhere often requires transliteration of full sentences. The lack of large parallel text collections of full sentence (as opposed to single word) transliterations necessitates incorporation of contextual information into transliteration via non-parallel resources, such as via mono-script text collections. In this article, we present a number of methods for improving transliteration in context for such a use scenario. Some of these methods in fact improve performance without making use of sentential context, allowing for better quantification of the degree to which contextual information in particular is responsible for system improvements. Our final systems, which ultimately rely upon ensembles including large pretrained language models fine-tuned on simulated parallel data, yield substantial improvements over the best previously reported results for full sentence transliteration from Latin to native script on all 12 languages in the Dakshina dataset (Roark et al. 2020), with an overall 3.3% absolute (18.6% relative) mean word-error rate reduction.</abstract>
<identifier type="citekey">kirov-etal-2024-context</identifier>
<identifier type="doi">10.1162/coli_a_00510</identifier>
<location>
<url>https://aclanthology.org/2024.cl-2.2/</url>
</location>
<part>
<date>2024-06</date>
<detail type="volume"><number>50</number></detail>
<detail type="issue"><number>2</number></detail>
<extent unit="page">
<start>475</start>
<end>534</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Context-aware Transliteration of Romanized South Asian Languages
%A Kirov, Christo
%A Johny, Cibu
%A Katanova, Anna
%A Gutkin, Alexander
%A Roark, Brian
%J Computational Linguistics
%D 2024
%8 June
%V 50
%N 2
%I MIT Press
%C Cambridge, MA
%F kirov-etal-2024-context
%X While most transliteration research is focused on single tokens such as named entities—for example, transliteration from the Gujarati script to the Latin script as “Ahmedabad” (the most populous city in the Indian state of Gujarat)—the informal romanization prevalent in South Asia and elsewhere often requires transliteration of full sentences. The lack of large parallel text collections of full sentence (as opposed to single word) transliterations necessitates incorporation of contextual information into transliteration via non-parallel resources, such as via mono-script text collections. In this article, we present a number of methods for improving transliteration in context for such a use scenario. Some of these methods in fact improve performance without making use of sentential context, allowing for better quantification of the degree to which contextual information in particular is responsible for system improvements. Our final systems, which ultimately rely upon ensembles including large pretrained language models fine-tuned on simulated parallel data, yield substantial improvements over the best previously reported results for full sentence transliteration from Latin to native script on all 12 languages in the Dakshina dataset (Roark et al. 2020), with an overall 3.3% absolute (18.6% relative) mean word-error rate reduction.
%R 10.1162/coli_a_00510
%U https://aclanthology.org/2024.cl-2.2/
%U https://doi.org/10.1162/coli_a_00510
%P 475-534
Markdown (Informal)
[Context-aware Transliteration of Romanized South Asian Languages](https://aclanthology.org/2024.cl-2.2/) (Kirov et al., CL 2024)
ACL