@inproceedings{oncevay-etal-2022-schaman,
title = "{S}ch{A}man: Spell-Checking Resources and Benchmark for Endangered Languages from Amazonia",
author = "Oncevay, Arturo and
Cardoso, Gerardo and
Alva, Carlo and
Lara {\'A}vila, C{\'e}sar and
V{\'a}squez Balarezo, Jovita and
Escobar Rodr{\'\i}guez, Sa{\'u}l and
Siticonatzi Camaiteri, Delio and
Zumaeta Rojas, Esa{\'u} and
L{\'o}pez Francis, Didier and
L{\'o}pez Bautista, Juan and
Acho Rios, Nimia and
Zapata Cesareo, Remigio and
G{\'o}mez Montoya, H{\'e}ctor Erasmo and
Zariquiey, Roberto",
editor = "He, Yulan and
Ji, Heng and
Li, Sujian and
Liu, Yang and
Chang, Chua-Hui",
booktitle = "Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)",
month = nov,
year = "2022",
address = "Online only",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.aacl-short.51",
doi = "10.18653/v1/2022.aacl-short.51",
pages = "411--417",
abstract = "Spell-checkers are core applications in language learning and normalisation, which may enormously contribute to language revitalisation and language teaching in the context of indigenous communities. Spell-checking as a generation task, however, requires a large amount of data, which is not feasible for endangered languages, such as the languages spoken in Peruvian Amazonia. We propose here augmentation methods for various misspelling types as a strategy to train neural spell-checking models and we create an evaluation resource for four indigenous languages of Peru: Shipibo-Konibo, Ash{\'a}ninka, Y{\'a}nesha, Yine. We focus on special errors that are significant for learning these languages, such as phoneme-to-grapheme ambiguity, grammatical errors (gender, tense, number, among others), accentuation, punctuation and normalisation in contexts where two or more writing traditions co-exist. We found that an ensemble model, trained with augmented data from various types of error, achieves overall better scores in most of the error types and languages. Finally, we released our spell-checkers as a web service to be used by indigenous communities and organisations to develop future language materials.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="oncevay-etal-2022-schaman">
<titleInfo>
<title>SchAman: Spell-Checking Resources and Benchmark for Endangered Languages from Amazonia</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arturo</namePart>
<namePart type="family">Oncevay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gerardo</namePart>
<namePart type="family">Cardoso</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carlo</namePart>
<namePart type="family">Alva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">César</namePart>
<namePart type="family">Lara Ávila</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jovita</namePart>
<namePart type="family">Vásquez Balarezo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saúl</namePart>
<namePart type="family">Escobar Rodríguez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Delio</namePart>
<namePart type="family">Siticonatzi Camaiteri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Esaú</namePart>
<namePart type="family">Zumaeta Rojas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Didier</namePart>
<namePart type="family">López Francis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">López Bautista</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nimia</namePart>
<namePart type="family">Acho Rios</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Remigio</namePart>
<namePart type="family">Zapata Cesareo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Héctor</namePart>
<namePart type="given">Erasmo</namePart>
<namePart type="family">Gómez Montoya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Zariquiey</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sujian</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chua-Hui</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online only</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Spell-checkers are core applications in language learning and normalisation, which may enormously contribute to language revitalisation and language teaching in the context of indigenous communities. Spell-checking as a generation task, however, requires a large amount of data, which is not feasible for endangered languages, such as the languages spoken in Peruvian Amazonia. We propose here augmentation methods for various misspelling types as a strategy to train neural spell-checking models and we create an evaluation resource for four indigenous languages of Peru: Shipibo-Konibo, Asháninka, Yánesha, Yine. We focus on special errors that are significant for learning these languages, such as phoneme-to-grapheme ambiguity, grammatical errors (gender, tense, number, among others), accentuation, punctuation and normalisation in contexts where two or more writing traditions co-exist. We found that an ensemble model, trained with augmented data from various types of error, achieves overall better scores in most of the error types and languages. Finally, we released our spell-checkers as a web service to be used by indigenous communities and organisations to develop future language materials.</abstract>
<identifier type="citekey">oncevay-etal-2022-schaman</identifier>
<identifier type="doi">10.18653/v1/2022.aacl-short.51</identifier>
<location>
<url>https://aclanthology.org/2022.aacl-short.51</url>
</location>
<part>
<date>2022-11</date>
<extent unit="page">
<start>411</start>
<end>417</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SchAman: Spell-Checking Resources and Benchmark for Endangered Languages from Amazonia
%A Oncevay, Arturo
%A Cardoso, Gerardo
%A Alva, Carlo
%A Lara Ávila, César
%A Vásquez Balarezo, Jovita
%A Escobar Rodríguez, Saúl
%A Siticonatzi Camaiteri, Delio
%A Zumaeta Rojas, Esaú
%A López Francis, Didier
%A López Bautista, Juan
%A Acho Rios, Nimia
%A Zapata Cesareo, Remigio
%A Gómez Montoya, Héctor Erasmo
%A Zariquiey, Roberto
%Y He, Yulan
%Y Ji, Heng
%Y Li, Sujian
%Y Liu, Yang
%Y Chang, Chua-Hui
%S Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)
%D 2022
%8 November
%I Association for Computational Linguistics
%C Online only
%F oncevay-etal-2022-schaman
%X Spell-checkers are core applications in language learning and normalisation, which may enormously contribute to language revitalisation and language teaching in the context of indigenous communities. Spell-checking as a generation task, however, requires a large amount of data, which is not feasible for endangered languages, such as the languages spoken in Peruvian Amazonia. We propose here augmentation methods for various misspelling types as a strategy to train neural spell-checking models and we create an evaluation resource for four indigenous languages of Peru: Shipibo-Konibo, Asháninka, Yánesha, Yine. We focus on special errors that are significant for learning these languages, such as phoneme-to-grapheme ambiguity, grammatical errors (gender, tense, number, among others), accentuation, punctuation and normalisation in contexts where two or more writing traditions co-exist. We found that an ensemble model, trained with augmented data from various types of error, achieves overall better scores in most of the error types and languages. Finally, we released our spell-checkers as a web service to be used by indigenous communities and organisations to develop future language materials.
%R 10.18653/v1/2022.aacl-short.51
%U https://aclanthology.org/2022.aacl-short.51
%U https://doi.org/10.18653/v1/2022.aacl-short.51
%P 411-417
Markdown (Informal)
[SchAman: Spell-Checking Resources and Benchmark for Endangered Languages from Amazonia](https://aclanthology.org/2022.aacl-short.51) (Oncevay et al., AACL-IJCNLP 2022)
ACL
- Arturo Oncevay, Gerardo Cardoso, Carlo Alva, César Lara Ávila, Jovita Vásquez Balarezo, Saúl Escobar Rodríguez, Delio Siticonatzi Camaiteri, Esaú Zumaeta Rojas, Didier López Francis, Juan López Bautista, Nimia Acho Rios, Remigio Zapata Cesareo, Héctor Erasmo Gómez Montoya, and Roberto Zariquiey. 2022. SchAman: Spell-Checking Resources and Benchmark for Endangered Languages from Amazonia. In Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 2: Short Papers), pages 411–417, Online only. Association for Computational Linguistics.