@inproceedings{moeller-hulden-2018-automatic,
title = "Automatic Glossing in a Low-Resource Setting for Language Documentation",
author = "Moeller, Sarah and
Hulden, Mans",
editor = "Klavans, Judith L.",
booktitle = "Proceedings of the Workshop on Computational Modeling of Polysynthetic Languages",
month = aug,
year = "2018",
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W18-4809",
pages = "84--93",
abstract = "Morphological analysis of morphologically rich and low-resource languages is important to both descriptive linguistics and natural language processing. Field documentary efforts usually procure analyzed data in cooperation with native speakers who are capable of providing some level of linguistic information. Manually annotating such data is very expensive and the traditional process is arguably too slow in the face of language endangerment and loss. We report on a case study of learning to automatically gloss a Nakh-Daghestanian language, Lezgi, from a very small amount of seed data. We compare a conditional random field based sequence labeler and a neural encoder-decoder model and show that a nearly 0.9 F1-score on labeled accuracy of morphemes can be achieved with 3,000 words of transcribed oral text. Errors are mostly limited to morphemes with high allomorphy. These results are potentially useful for developing rapid annotation and fieldwork tools to support documentation of morphologically rich, endangered languages.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="moeller-hulden-2018-automatic">
<titleInfo>
<title>Automatic Glossing in a Low-Resource Setting for Language Documentation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sarah</namePart>
<namePart type="family">Moeller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mans</namePart>
<namePart type="family">Hulden</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Computational Modeling of Polysynthetic Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Judith</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Klavans</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Santa Fe, New Mexico, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Morphological analysis of morphologically rich and low-resource languages is important to both descriptive linguistics and natural language processing. Field documentary efforts usually procure analyzed data in cooperation with native speakers who are capable of providing some level of linguistic information. Manually annotating such data is very expensive and the traditional process is arguably too slow in the face of language endangerment and loss. We report on a case study of learning to automatically gloss a Nakh-Daghestanian language, Lezgi, from a very small amount of seed data. We compare a conditional random field based sequence labeler and a neural encoder-decoder model and show that a nearly 0.9 F1-score on labeled accuracy of morphemes can be achieved with 3,000 words of transcribed oral text. Errors are mostly limited to morphemes with high allomorphy. These results are potentially useful for developing rapid annotation and fieldwork tools to support documentation of morphologically rich, endangered languages.</abstract>
<identifier type="citekey">moeller-hulden-2018-automatic</identifier>
<location>
<url>https://aclanthology.org/W18-4809</url>
</location>
<part>
<date>2018-08</date>
<extent unit="page">
<start>84</start>
<end>93</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automatic Glossing in a Low-Resource Setting for Language Documentation
%A Moeller, Sarah
%A Hulden, Mans
%Y Klavans, Judith L.
%S Proceedings of the Workshop on Computational Modeling of Polysynthetic Languages
%D 2018
%8 August
%I Association for Computational Linguistics
%C Santa Fe, New Mexico, USA
%F moeller-hulden-2018-automatic
%X Morphological analysis of morphologically rich and low-resource languages is important to both descriptive linguistics and natural language processing. Field documentary efforts usually procure analyzed data in cooperation with native speakers who are capable of providing some level of linguistic information. Manually annotating such data is very expensive and the traditional process is arguably too slow in the face of language endangerment and loss. We report on a case study of learning to automatically gloss a Nakh-Daghestanian language, Lezgi, from a very small amount of seed data. We compare a conditional random field based sequence labeler and a neural encoder-decoder model and show that a nearly 0.9 F1-score on labeled accuracy of morphemes can be achieved with 3,000 words of transcribed oral text. Errors are mostly limited to morphemes with high allomorphy. These results are potentially useful for developing rapid annotation and fieldwork tools to support documentation of morphologically rich, endangered languages.
%U https://aclanthology.org/W18-4809
%P 84-93
Markdown (Informal)
[Automatic Glossing in a Low-Resource Setting for Language Documentation](https://aclanthology.org/W18-4809) (Moeller & Hulden, PYLO 2018)
ACL