@inproceedings{ebrahimi-etal-2023-meeting,
title = "Meeting the Needs of Low-Resource Languages: The Value of Automatic Alignments via Pretrained Models",
author = "Ebrahimi, Abteen and
McCarthy, Arya D. and
Oncevay, Arturo and
Ortega, John E. and
Chiruzzo, Luis and
Gim{\'e}nez-Lugo, Gustavo and
Coto-Solano, Rolando and
Kann, Katharina",
editor = "Vlachos, Andreas and
Augenstein, Isabelle",
booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.eacl-main.280",
doi = "10.18653/v1/2023.eacl-main.280",
pages = "3912--3926",
abstract = "Large multilingual models have inspired a new class of word alignment methods, which work well for the model{'}s pretraining languages. However, the languages most in need of automatic alignment are low-resource and, thus, not typically included in the pretraining data. In this work, we ask: How do modern aligners perform on unseen languages, and are they better than traditional methods? We contribute gold-standard alignments for Bribri{--}Spanish, Guarani{--}Spanish, Quechua{--}Spanish, and Shipibo-Konibo{--}Spanish. With these, we evaluate state-of-the-art aligners with and without model adaptation to the target language. Finally, we also evaluate the resulting alignments extrinsically through two downstream tasks: named entity recognition and part-of-speech tagging. We find that although transformer-based methods generally outperform traditional models, the two classes of approach remain competitive with each other.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ebrahimi-etal-2023-meeting">
<titleInfo>
<title>Meeting the Needs of Low-Resource Languages: The Value of Automatic Alignments via Pretrained Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Abteen</namePart>
<namePart type="family">Ebrahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arya</namePart>
<namePart type="given">D.</namePart>
<namePart type="family">McCarthy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arturo</namePart>
<namePart type="family">Oncevay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="given">E.</namePart>
<namePart type="family">Ortega</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gustavo</namePart>
<namePart type="family">Giménez-Lugo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rolando</namePart>
<namePart type="family">Coto-Solano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katharina</namePart>
<namePart type="family">Kann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Vlachos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isabelle</namePart>
<namePart type="family">Augenstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dubrovnik, Croatia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large multilingual models have inspired a new class of word alignment methods, which work well for the model’s pretraining languages. However, the languages most in need of automatic alignment are low-resource and, thus, not typically included in the pretraining data. In this work, we ask: How do modern aligners perform on unseen languages, and are they better than traditional methods? We contribute gold-standard alignments for Bribri–Spanish, Guarani–Spanish, Quechua–Spanish, and Shipibo-Konibo–Spanish. With these, we evaluate state-of-the-art aligners with and without model adaptation to the target language. Finally, we also evaluate the resulting alignments extrinsically through two downstream tasks: named entity recognition and part-of-speech tagging. We find that although transformer-based methods generally outperform traditional models, the two classes of approach remain competitive with each other.</abstract>
<identifier type="citekey">ebrahimi-etal-2023-meeting</identifier>
<identifier type="doi">10.18653/v1/2023.eacl-main.280</identifier>
<location>
<url>https://aclanthology.org/2023.eacl-main.280</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>3912</start>
<end>3926</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Meeting the Needs of Low-Resource Languages: The Value of Automatic Alignments via Pretrained Models
%A Ebrahimi, Abteen
%A McCarthy, Arya D.
%A Oncevay, Arturo
%A Ortega, John E.
%A Chiruzzo, Luis
%A Giménez-Lugo, Gustavo
%A Coto-Solano, Rolando
%A Kann, Katharina
%Y Vlachos, Andreas
%Y Augenstein, Isabelle
%S Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F ebrahimi-etal-2023-meeting
%X Large multilingual models have inspired a new class of word alignment methods, which work well for the model’s pretraining languages. However, the languages most in need of automatic alignment are low-resource and, thus, not typically included in the pretraining data. In this work, we ask: How do modern aligners perform on unseen languages, and are they better than traditional methods? We contribute gold-standard alignments for Bribri–Spanish, Guarani–Spanish, Quechua–Spanish, and Shipibo-Konibo–Spanish. With these, we evaluate state-of-the-art aligners with and without model adaptation to the target language. Finally, we also evaluate the resulting alignments extrinsically through two downstream tasks: named entity recognition and part-of-speech tagging. We find that although transformer-based methods generally outperform traditional models, the two classes of approach remain competitive with each other.
%R 10.18653/v1/2023.eacl-main.280
%U https://aclanthology.org/2023.eacl-main.280
%U https://doi.org/10.18653/v1/2023.eacl-main.280
%P 3912-3926
Markdown (Informal)
[Meeting the Needs of Low-Resource Languages: The Value of Automatic Alignments via Pretrained Models](https://aclanthology.org/2023.eacl-main.280) (Ebrahimi et al., EACL 2023)
ACL
- Abteen Ebrahimi, Arya D. McCarthy, Arturo Oncevay, John E. Ortega, Luis Chiruzzo, Gustavo Giménez-Lugo, Rolando Coto-Solano, and Katharina Kann. 2023. Meeting the Needs of Low-Resource Languages: The Value of Automatic Alignments via Pretrained Models. In Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics, pages 3912–3926, Dubrovnik, Croatia. Association for Computational Linguistics.