% Journal article: Sajjad et al. (2017), Computational Linguistics 43(2), pp. 349-375.
@article{sajjad-etal-2017-statistical,
  title     = {Statistical Models for Unsupervised, Semi-Supervised, and Supervised Transliteration Mining},
  author    = {Sajjad, Hassan and
               Schmid, Helmut and
               Fraser, Alexander and
               Sch{\"u}tze, Hinrich},
  journal   = {Computational Linguistics},
  volume    = {43},
  number    = {2},
  month     = jun,
  year      = {2017},
  address   = {Cambridge, MA},
  publisher = {MIT Press},
  url       = {https://aclanthology.org/J17-2003},
  doi       = {10.1162/COLI_a_00286},
  pages     = {349--375},
  abstract  = {We present a generative model that efficiently mines transliteration pairs in a consistent fashion in three different settings: unsupervised, semi-supervised, and supervised transliteration mining. The model interpolates two sub-models, one for the generation of transliteration pairs and one for the generation of non-transliteration pairs (i.e., noise). The model is trained on noisy unlabeled data using the EM algorithm. During training the transliteration sub-model learns to generate transliteration pairs and the fixed non-transliteration model generates the noise pairs. After training, the unlabeled data is disambiguated based on the posterior probabilities of the two sub-models. We evaluate our transliteration mining system on data from a transliteration mining shared task and on parallel corpora. For three out of four language pairs, our system outperforms all semi-supervised and supervised systems that participated in the NEWS 2010 shared task. On word pairs extracted from parallel corpora with fewer than 2{\%} transliteration pairs, our system achieves up to 86.7{\%} F-measure with 77.9{\%} precision and 97.8{\%} recall.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS record for a journal article in Computational Linguistics 43(2), 2017, pages 349-375. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="sajjad-etal-2017-statistical">
    <titleInfo>
      <title>Statistical Models for Unsupervised, Semi-Supervised, and Supervised Transliteration Mining</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Hassan</namePart>
      <namePart type="family">Sajjad</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Helmut</namePart>
      <namePart type="family">Schmid</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alexander</namePart>
      <namePart type="family">Fraser</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hinrich</namePart>
      <namePart type="family">Schütze</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2017-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <genre authority="bibutilsgt">journal article</genre>
    <!-- Host item: the journal in which the article appeared. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Computational Linguistics</title>
      </titleInfo>
      <originInfo>
        <issuance>continuing</issuance>
        <publisher>MIT Press</publisher>
        <place>
          <placeTerm type="text">Cambridge, MA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">periodical</genre>
      <genre authority="bibutilsgt">academic journal</genre>
    </relatedItem>
    <abstract>We present a generative model that efficiently mines transliteration pairs in a consistent fashion in three different settings: unsupervised, semi-supervised, and supervised transliteration mining. The model interpolates two sub-models, one for the generation of transliteration pairs and one for the generation of non-transliteration pairs (i.e., noise). The model is trained on noisy unlabeled data using the EM algorithm. During training the transliteration sub-model learns to generate transliteration pairs and the fixed non-transliteration model generates the noise pairs. After training, the unlabeled data is disambiguated based on the posterior probabilities of the two sub-models. We evaluate our transliteration mining system on data from a transliteration mining shared task and on parallel corpora. For three out of four language pairs, our system outperforms all semi-supervised and supervised systems that participated in the NEWS 2010 shared task. On word pairs extracted from parallel corpora with fewer than 2% transliteration pairs, our system achieves up to 86.7% F-measure with 77.9% precision and 97.8% recall.</abstract>
    <identifier type="citekey">sajjad-etal-2017-statistical</identifier>
    <identifier type="doi">10.1162/COLI_a_00286</identifier>
    <location>
      <url>https://aclanthology.org/J17-2003</url>
    </location>
    <!-- Volume, issue, and page range within the host journal. -->
    <part>
      <date>2017-06</date>
      <detail type="volume"><number>43</number></detail>
      <detail type="issue"><number>2</number></detail>
      <extent unit="page">
        <start>349</start>
        <end>375</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Journal Article
%T Statistical Models for Unsupervised, Semi-Supervised, and Supervised Transliteration Mining
%A Sajjad, Hassan
%A Schmid, Helmut
%A Fraser, Alexander
%A Schütze, Hinrich
%J Computational Linguistics
%D 2017
%8 June
%V 43
%N 2
%I MIT Press
%C Cambridge, MA
%F sajjad-etal-2017-statistical
%X We present a generative model that efficiently mines transliteration pairs in a consistent fashion in three different settings: unsupervised, semi-supervised, and supervised transliteration mining. The model interpolates two sub-models, one for the generation of transliteration pairs and one for the generation of non-transliteration pairs (i.e., noise). The model is trained on noisy unlabeled data using the EM algorithm. During training the transliteration sub-model learns to generate transliteration pairs and the fixed non-transliteration model generates the noise pairs. After training, the unlabeled data is disambiguated based on the posterior probabilities of the two sub-models. We evaluate our transliteration mining system on data from a transliteration mining shared task and on parallel corpora. For three out of four language pairs, our system outperforms all semi-supervised and supervised systems that participated in the NEWS 2010 shared task. On word pairs extracted from parallel corpora with fewer than 2% transliteration pairs, our system achieves up to 86.7% F-measure with 77.9% precision and 97.8% recall.
%R 10.1162/COLI_a_00286
%U https://aclanthology.org/J17-2003
%U https://doi.org/10.1162/COLI_a_00286
%P 349-375
Markdown (Informal)
[Statistical Models for Unsupervised, Semi-Supervised, and Supervised Transliteration Mining](https://aclanthology.org/J17-2003) (Sajjad et al., CL 2017)
ACL