@inproceedings{kevers-2022-coswid,
title = "{C}o{S}w{ID}, a Code Switching Identification Method Suitable for Under-Resourced Languages",
author = "Kevers, Laurent",
editor = "Melero, Maite and
Sakti, Sakriani and
Soria, Claudia",
booktitle = "Proceedings of the 1st Annual Meeting of the ELRA/ISCA Special Interest Group on Under-Resourced Languages",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.sigul-1.15/",
pages = "112--121",
abstract = "We propose a method for identifying monolingual textual segments in multilingual documents. It requires only a minimal number of linguistic resources {--} word lists and monolingual corpora {--} and can therefore be adapted to many under-resourced languages. Taking these languages into account when processing multilingual documents in NLP tools is important as it can contribute to the creation of essential textual resources. This language identification task {--} code switching detection being its most complex form {--} can also provide added value to various existing data or tools. Our research demonstrates that a language identification module performing well on short texts can be used to efficiently analyse a document through a sliding window. The results obtained for code switching identification {--} between 87.29{\%} and 97.97{\%} accuracy {--} are state-of-the-art, which is confirmed by the benchmarks performed on the few available systems that have been used on our test data."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kevers-2022-coswid">
<titleInfo>
<title>CoSwID, a Code Switching Identification Method Suitable for Under-Resourced Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Laurent</namePart>
<namePart type="family">Kevers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Annual Meeting of the ELRA/ISCA Special Interest Group on Under-Resourced Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maite</namePart>
<namePart type="family">Melero</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Soria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We propose a method for identifying monolingual textual segments in multilingual documents. It requires only a minimal number of linguistic resources – word lists and monolingual corpora – and can therefore be adapted to many under-resourced languages. Taking these languages into account when processing multilingual documents in NLP tools is important as it can contribute to the creation of essential textual resources. This language identification task – code switching detection being its most complex form – can also provide added value to various existing data or tools. Our research demonstrates that a language identification module performing well on short texts can be used to efficiently analyse a document through a sliding window. The results obtained for code switching identification – between 87.29% and 97.97% accuracy – are state-of-the-art, which is confirmed by the benchmarks performed on the few available systems that have been used on our test data.</abstract>
<identifier type="citekey">kevers-2022-coswid</identifier>
<location>
<url>https://aclanthology.org/2022.sigul-1.15/</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>112</start>
<end>121</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CoSwID, a Code Switching Identification Method Suitable for Under-Resourced Languages
%A Kevers, Laurent
%Y Melero, Maite
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 1st Annual Meeting of the ELRA/ISCA Special Interest Group on Under-Resourced Languages
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F kevers-2022-coswid
%X We propose a method for identifying monolingual textual segments in multilingual documents. It requires only a minimal number of linguistic resources – word lists and monolingual corpora – and can therefore be adapted to many under-resourced languages. Taking these languages into account when processing multilingual documents in NLP tools is important as it can contribute to the creation of essential textual resources. This language identification task – code switching detection being its most complex form – can also provide added value to various existing data or tools. Our research demonstrates that a language identification module performing well on short texts can be used to efficiently analyse a document through a sliding window. The results obtained for code switching identification – between 87.29% and 97.97% accuracy – are state-of-the-art, which is confirmed by the benchmarks performed on the few available systems that have been used on our test data.
%U https://aclanthology.org/2022.sigul-1.15/
%P 112-121
Markdown (Informal)
[CoSwID, a Code Switching Identification Method Suitable for Under-Resourced Languages](https://aclanthology.org/2022.sigul-1.15/) (Kevers, SIGUL 2022)
ACL