@inproceedings{gargova-etal-2022-evaluation,
title = "Evaluation of Off-the-Shelf Language Identification Tools on {B}ulgarian Social Media Posts",
author = "Gargova, Silvia and
Temnikova, Irina and
Dzhumerov, Ivo and
Nikolaeva, Hristiana",
booktitle = "Proceedings of the 5th International Conference on Computational Linguistics in Bulgaria (CLIB 2022)",
month = sep,
year = "2022",
address = "Sofia, Bulgaria",
publisher = "Department of Computational Linguistics, IBL -- BAS",
url = "https://aclanthology.org/2022.clib-1.18",
pages = "152--161",
abstract = "Automatic Language Identification (LI) is a widely addressed task, but not all users (for example linguists) have the means or interest to develop their own tool or to train the existing ones with their own data. There are several off-the-shelf LI tools, but for some languages, it is unclear which tool is the best for specific types of text. This article presents a comparison of the performance of several off-the-shelf language identification tools on Bulgarian social media data. The LI tools are tested on a multilingual Twitter dataset (composed of 2966 tweets) and an existing Bulgarian Twitter dataset on the topic of fake content detection of 3350 tweets. The article presents the manual annotation procedure of the first dataset, a discussion of the decisions of the two annotators, and the results from testing the 7 off-the-shelf LI tools on both datasets. Our findings show that the tool, which is the easiest for users with no programming skills, achieves the highest F1-Score on Bulgarian social media data, while other tools have very useful functionalities for Bulgarian social media texts.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gargova-etal-2022-evaluation">
<titleInfo>
<title>Evaluation of Off-the-Shelf Language Identification Tools on Bulgarian Social Media Posts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Silvia</namePart>
<namePart type="family">Gargova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Irina</namePart>
<namePart type="family">Temnikova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivo</namePart>
<namePart type="family">Dzhumerov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hristiana</namePart>
<namePart type="family">Nikolaeva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th International Conference on Computational Linguistics in Bulgaria (CLIB 2022)</title>
</titleInfo>
<originInfo>
<publisher>Department of Computational Linguistics, IBL – BAS</publisher>
<place>
<placeTerm type="text">Sofia, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Automatic Language Identification (LI) is a widely addressed task, but not all users (for example linguists) have the means or interest to develop their own tool or to train the existing ones with their own data. There are several off-the-shelf LI tools, but for some languages, it is unclear which tool is the best for specific types of text. This article presents a comparison of the performance of several off-the-shelf language identification tools on Bulgarian social media data. The LI tools are tested on a multilingual Twitter dataset (composed of 2966 tweets) and an existing Bulgarian Twitter dataset on the topic of fake content detection of 3350 tweets. The article presents the manual annotation procedure of the first dataset, a discussion of the decisions of the two annotators, and the results from testing the 7 off-the-shelf LI tools on both datasets. Our findings show that the tool, which is the easiest for users with no programming skills, achieves the highest F1-Score on Bulgarian social media data, while other tools have very useful functionalities for Bulgarian social media texts.</abstract>
<identifier type="citekey">gargova-etal-2022-evaluation</identifier>
<location>
<url>https://aclanthology.org/2022.clib-1.18</url>
</location>
<part>
<date>2022-09</date>
<extent unit="page">
<start>152</start>
<end>161</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluation of Off-the-Shelf Language Identification Tools on Bulgarian Social Media Posts
%A Gargova, Silvia
%A Temnikova, Irina
%A Dzhumerov, Ivo
%A Nikolaeva, Hristiana
%S Proceedings of the 5th International Conference on Computational Linguistics in Bulgaria (CLIB 2022)
%D 2022
%8 September
%I Department of Computational Linguistics, IBL – BAS
%C Sofia, Bulgaria
%F gargova-etal-2022-evaluation
%X Automatic Language Identification (LI) is a widely addressed task, but not all users (for example linguists) have the means or interest to develop their own tool or to train the existing ones with their own data. There are several off-the-shelf LI tools, but for some languages, it is unclear which tool is the best for specific types of text. This article presents a comparison of the performance of several off-the-shelf language identification tools on Bulgarian social media data. The LI tools are tested on a multilingual Twitter dataset (composed of 2966 tweets) and an existing Bulgarian Twitter dataset on the topic of fake content detection of 3350 tweets. The article presents the manual annotation procedure of the first dataset, a discussion of the decisions of the two annotators, and the results from testing the 7 off-the-shelf LI tools on both datasets. Our findings show that the tool, which is the easiest for users with no programming skills, achieves the highest F1-Score on Bulgarian social media data, while other tools have very useful functionalities for Bulgarian social media texts.
%U https://aclanthology.org/2022.clib-1.18
%P 152-161
Markdown (Informal)
[Evaluation of Off-the-Shelf Language Identification Tools on Bulgarian Social Media Posts](https://aclanthology.org/2022.clib-1.18) (Gargova et al., CLIB 2022)
ACL