@inproceedings{dadason-loftsson-2024-text,
title = "Text Filtering Classifiers for Medium-Resource Languages",
author = "Da{\dh}ason, J{\'o}n and
Loftsson, Hrafn",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1372/",
pages = "15789--15801",
abstract = "Web-crawled corpora are essential resources for linguistic and NLP research, offering far more data than is available from curated corpora. However, they often contain a great deal of low-quality texts which can complicate research and degrade the quality of pre-trained language models. Therefore, they are typically filtered, e.g. by applying rules or classifiers. In this paper, we compare the effectiveness of various text filtering classifiers and measure their impact on language model performance for three medium-resource languages. We present TQ-IS, an Icelandic text quality dataset consisting of 2,000 web-crawled documents, in which spans of low-quality text have been manually identified and labeled. We then evaluate a perplexity-based classifier, a supervised classifier trained on TQ-IS, and a self-supervised classifier trained to discern between documents from curated and web-crawled corpora on Icelandic, Estonian and Basque. We find that these classifiers obtain F1 scores of 94.48{\%}, 99.01{\%} and 93.40{\%}, respectively, when evaluated on the TQ-IS dataset. Furthermore, our results show that while adding filtered web-crawled text to a pre-training corpus can improve downstream performance for pre-trained language models, any improvement is likely to remain modest unless the web-crawled corpus is significantly larger in size."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dadason-loftsson-2024-text">
<titleInfo>
<title>Text Filtering Classifiers for Medium-Resource Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jón</namePart>
<namePart type="family">Da\dhason</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hrafn</namePart>
<namePart type="family">Loftsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Web-crawled corpora are essential resources for linguistic and NLP research, offering far more data than is available from curated corpora. However, they often contain a great deal of low-quality texts which can complicate research and degrade the quality of pre-trained language models. Therefore, they are typically filtered, e.g. by applying rules or classifiers. In this paper, we compare the effectiveness of various text filtering classifiers and measure their impact on language model performance for three medium-resource languages. We present TQ-IS, an Icelandic text quality dataset consisting of 2,000 web-crawled documents, in which spans of low-quality text have been manually identified and labeled. We then evaluate a perplexity-based classifier, a supervised classifier trained on TQ-IS, and a self-supervised classifier trained to discern between documents from curated and web-crawled corpora on Icelandic, Estonian and Basque. We find that these classifiers obtain F1 scores of 94.48%, 99.01% and 93.40%, respectively, when evaluated on the TQ-IS dataset. Furthermore, our results show that while adding filtered web-crawled text to a pre-training corpus can improve downstream performance for pre-trained language models, any improvement is likely to remain modest unless the web-crawled corpus is significantly larger in size.</abstract>
<identifier type="citekey">dadason-loftsson-2024-text</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1372/</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>15789</start>
<end>15801</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Text Filtering Classifiers for Medium-Resource Languages
%A Da\dhason, Jón
%A Loftsson, Hrafn
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F dadason-loftsson-2024-text
%X Web-crawled corpora are essential resources for linguistic and NLP research, offering far more data than is available from curated corpora. However, they often contain a great deal of low-quality texts which can complicate research and degrade the quality of pre-trained language models. Therefore, they are typically filtered, e.g. by applying rules or classifiers. In this paper, we compare the effectiveness of various text filtering classifiers and measure their impact on language model performance for three medium-resource languages. We present TQ-IS, an Icelandic text quality dataset consisting of 2,000 web-crawled documents, in which spans of low-quality text have been manually identified and labeled. We then evaluate a perplexity-based classifier, a supervised classifier trained on TQ-IS, and a self-supervised classifier trained to discern between documents from curated and web-crawled corpora on Icelandic, Estonian and Basque. We find that these classifiers obtain F1 scores of 94.48%, 99.01% and 93.40%, respectively, when evaluated on the TQ-IS dataset. Furthermore, our results show that while adding filtered web-crawled text to a pre-training corpus can improve downstream performance for pre-trained language models, any improvement is likely to remain modest unless the web-crawled corpus is significantly larger in size.
%U https://aclanthology.org/2024.lrec-main.1372/
%P 15789-15801
Markdown (Informal)
[Text Filtering Classifiers for Medium-Resource Languages](https://aclanthology.org/2024.lrec-main.1372/) (Daðason & Loftsson, LREC-COLING 2024)
ACL
- Jón Daðason and Hrafn Loftsson. 2024. Text Filtering Classifiers for Medium-Resource Languages. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 15789–15801, Torino, Italia. ELRA and ICCL.