@inproceedings{jauhiainen-linden-2024-improving,
title = "Improving Language Coverage on {H}e{LI}-{OTS}",
author = "Jauhiainen, Tommi and
Lind{\'e}n, Krister",
editor = "Melero, Maite and
Sakti, Sakriani and
Soria, Claudia",
booktitle = "Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.sigul-1.15",
pages = "115--125",
abstract = "In this paper, we add under-resourced languages into the language repertoire of an existing off-the-shelf language identifier, HeLI-OTS. Adding more languages to a language identifier often comes with the drawback of lessened accuracy for the languages already part of the repertoire. We aim to minimize this effect. As sources for training and development data in the new languages, we use the OpenLID and FLORES-200 datasets. They are openly available high-quality datasets that are especially well-suited for language identifier development. By carefully inspecting the effect of each added language and the quality of their training and development data, we managed to add support for 20 new under-resourced languages to HeLI-OTS without affecting the performance of any existing languages to a noticeable extent.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jauhiainen-linden-2024-improving">
<titleInfo>
<title>Improving Language Coverage on HeLI-OTS</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tommi</namePart>
<namePart type="family">Jauhiainen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Krister</namePart>
<namePart type="family">Lindén</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maite</namePart>
<namePart type="family">Melero</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Soria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we add under-resourced languages into the language repertoire of an existing off-the-shelf language identifier, HeLI-OTS. Adding more languages to a language identifier often comes with the drawback of lessened accuracy for the languages already part of the repertoire. We aim to minimize this effect. As sources for training and development data in the new languages, we use the OpenLID and FLORES-200 datasets. They are openly available high-quality datasets that are especially well-suited for language identifier development. By carefully inspecting the effect of each added language and the quality of their training and development data, we managed to add support for 20 new under-resourced languages to HeLI-OTS without affecting the performance of any existing languages to a noticeable extent.</abstract>
<identifier type="citekey">jauhiainen-linden-2024-improving</identifier>
<location>
<url>https://aclanthology.org/2024.sigul-1.15</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>115</start>
<end>125</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Language Coverage on HeLI-OTS
%A Jauhiainen, Tommi
%A Lindén, Krister
%Y Melero, Maite
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F jauhiainen-linden-2024-improving
%X In this paper, we add under-resourced languages into the language repertoire of an existing off-the-shelf language identifier, HeLI-OTS. Adding more languages to a language identifier often comes with the drawback of lessened accuracy for the languages already part of the repertoire. We aim to minimize this effect. As sources for training and development data in the new languages, we use the OpenLID and FLORES-200 datasets. They are openly available high-quality datasets that are especially well-suited for language identifier development. By carefully inspecting the effect of each added language and the quality of their training and development data, we managed to add support for 20 new under-resourced languages to HeLI-OTS without affecting the performance of any existing languages to a noticeable extent.
%U https://aclanthology.org/2024.sigul-1.15
%P 115-125
Markdown (Informal)
[Improving Language Coverage on HeLI-OTS](https://aclanthology.org/2024.sigul-1.15) (Jauhiainen & Lindén, SIGUL-WS 2024)
ACL
- Tommi Jauhiainen and Krister Lindén. 2024. Improving Language Coverage on HeLI-OTS. In Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024, pages 115–125, Torino, Italia. ELRA and ICCL.