@inproceedings{stenlund-etal-2023-improving,
title = "Improving Translation Quality for Low-Resource {I}nuktitut with Various Preprocessing Techniques",
author = "Stenlund, Mathias Hans Erik and
Nanni, Mathilde and
Bruton, Micaella and
Beloucif, Meriem",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing",
month = sep,
year = "2023",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2023.ranlp-1.53",
pages = "475--479",
abstract = "Neural machine translation has been shown to outperform all other machine translation paradigms when trained in a high-resource setting. However, it still performs poorly when dealing with low-resource languages, for which parallel data for training is scarce. This is especially the case for morphologically complex languages such as Turkish, Tamil, Uyghur, etc. In this paper, we investigate various preprocessing methods for Inuktitut, a low-resource indigenous language from North America, without a morphological analyzer. On both the original and romanized scripts, we test various preprocessing techniques such as Byte-Pair Encoding, random stemming, and data augmentation using Hungarian for the Inuktitut-to-English translation task. We found that there are benefits to retaining the original script as it helps to achieve higher BLEU scores than the romanized models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="stenlund-etal-2023-improving">
<titleInfo>
<title>Improving Translation Quality for Low-Resource Inuktitut with Various Preprocessing Techniques</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mathias</namePart>
<namePart type="given">Hans</namePart>
<namePart type="given">Erik</namePart>
<namePart type="family">Stenlund</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mathilde</namePart>
<namePart type="family">Nanni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Micaella</namePart>
<namePart type="family">Bruton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Meriem</namePart>
<namePart type="family">Beloucif</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Neural machine translation has been shown to outperform all other machine translation paradigms when trained in a high-resource setting. However, it still performs poorly when dealing with low-resource languages, for which parallel data for training is scarce. This is especially the case for morphologically complex languages such as Turkish, Tamil, Uyghur, etc. In this paper, we investigate various preprocessing methods for Inuktitut, a low-resource indigenous language from North America, without a morphological analyzer. On both the original and romanized scripts, we test various preprocessing techniques such as Byte-Pair Encoding, random stemming, and data augmentation using Hungarian for the Inuktitut-to-English translation task. We found that there are benefits to retaining the original script as it helps to achieve higher BLEU scores than the romanized models.</abstract>
<identifier type="citekey">stenlund-etal-2023-improving</identifier>
<location>
<url>https://aclanthology.org/2023.ranlp-1.53</url>
</location>
<part>
<date>2023-09</date>
<extent unit="page">
<start>475</start>
<end>479</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Translation Quality for Low-Resource Inuktitut with Various Preprocessing Techniques
%A Stenlund, Mathias Hans Erik
%A Nanni, Mathilde
%A Bruton, Micaella
%A Beloucif, Meriem
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing
%D 2023
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F stenlund-etal-2023-improving
%X Neural machine translation has been shown to outperform all other machine translation paradigms when trained in a high-resource setting. However, it still performs poorly when dealing with low-resource languages, for which parallel data for training is scarce. This is especially the case for morphologically complex languages such as Turkish, Tamil, Uyghur, etc. In this paper, we investigate various preprocessing methods for Inuktitut, a low-resource indigenous language from North America, without a morphological analyzer. On both the original and romanized scripts, we test various preprocessing techniques such as Byte-Pair Encoding, random stemming, and data augmentation using Hungarian for the Inuktitut-to-English translation task. We found that there are benefits to retaining the original script as it helps to achieve higher BLEU scores than the romanized models.
%U https://aclanthology.org/2023.ranlp-1.53
%P 475-479
Markdown (Informal)
[Improving Translation Quality for Low-Resource Inuktitut with Various Preprocessing Techniques](https://aclanthology.org/2023.ranlp-1.53) (Stenlund et al., RANLP 2023)
ACL