@inproceedings{baert-etal-2020-arabizi,
title = "{A}rabizi Language Models for Sentiment Analysis",
author = "Baert, Ga{\'e}tan and
Gahbiche, Souhir and
Gadek, Guillaume and
Pauchet, Alexandre",
editor = "Scott, Donia and
Bel, Nuria and
Zong, Chengqing",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.51",
doi = "10.18653/v1/2020.coling-main.51",
pages = "592--603",
abstract = "Arabizi is a written form of spoken Arabic, relying on Latin characters and digits. It is informal and does not follow any conventional rules, raising many NLP challenges. In particular, Arabizi has recently emerged as the Arabic language in online social networks, becoming of great interest for opinion mining and sentiment analysis. Unfortunately, only few Arabizi resources exist and state-of-the-art language models such as BERT do not consider Arabizi. In this work, we construct and release two datasets: (i) LAD, a corpus of 7.7M tweets written in Arabizi and (ii) SALAD, a subset of LAD, manually annotated for sentiment analysis. Then, a BERT architecture is pre-trained on LAD, in order to create and distribute an Arabizi language model called BAERT. We show that a language model (BAERT) pre-trained on a large corpus (LAD) in the same language (Arabizi) as that of the fine-tuning dataset (SALAD), outperforms a state-of-the-art multi-lingual pretrained model (multilingual BERT) on a sentiment analysis task.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="baert-etal-2020-arabizi">
<titleInfo>
<title>Arabizi Language Models for Sentiment Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gaétan</namePart>
<namePart type="family">Baert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Souhir</namePart>
<namePart type="family">Gahbiche</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guillaume</namePart>
<namePart type="family">Gadek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandre</namePart>
<namePart type="family">Pauchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Donia</namePart>
<namePart type="family">Scott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nuria</namePart>
<namePart type="family">Bel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Arabizi is a written form of spoken Arabic, relying on Latin characters and digits. It is informal and does not follow any conventional rules, raising many NLP challenges. In particular, Arabizi has recently emerged as the Arabic language in online social networks, becoming of great interest for opinion mining and sentiment analysis. Unfortunately, only few Arabizi resources exist and state-of-the-art language models such as BERT do not consider Arabizi. In this work, we construct and release two datasets: (i) LAD, a corpus of 7.7M tweets written in Arabizi and (ii) SALAD, a subset of LAD, manually annotated for sentiment analysis. Then, a BERT architecture is pre-trained on LAD, in order to create and distribute an Arabizi language model called BAERT. We show that a language model (BAERT) pre-trained on a large corpus (LAD) in the same language (Arabizi) as that of the fine-tuning dataset (SALAD), outperforms a state-of-the-art multi-lingual pretrained model (multilingual BERT) on a sentiment analysis task.</abstract>
<identifier type="citekey">baert-etal-2020-arabizi</identifier>
<identifier type="doi">10.18653/v1/2020.coling-main.51</identifier>
<location>
<url>https://aclanthology.org/2020.coling-main.51</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>592</start>
<end>603</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Arabizi Language Models for Sentiment Analysis
%A Baert, Gaétan
%A Gahbiche, Souhir
%A Gadek, Guillaume
%A Pauchet, Alexandre
%Y Scott, Donia
%Y Bel, Nuria
%Y Zong, Chengqing
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F baert-etal-2020-arabizi
%X Arabizi is a written form of spoken Arabic, relying on Latin characters and digits. It is informal and does not follow any conventional rules, raising many NLP challenges. In particular, Arabizi has recently emerged as the Arabic language in online social networks, becoming of great interest for opinion mining and sentiment analysis. Unfortunately, only few Arabizi resources exist and state-of-the-art language models such as BERT do not consider Arabizi. In this work, we construct and release two datasets: (i) LAD, a corpus of 7.7M tweets written in Arabizi and (ii) SALAD, a subset of LAD, manually annotated for sentiment analysis. Then, a BERT architecture is pre-trained on LAD, in order to create and distribute an Arabizi language model called BAERT. We show that a language model (BAERT) pre-trained on a large corpus (LAD) in the same language (Arabizi) as that of the fine-tuning dataset (SALAD), outperforms a state-of-the-art multi-lingual pretrained model (multilingual BERT) on a sentiment analysis task.
%R 10.18653/v1/2020.coling-main.51
%U https://aclanthology.org/2020.coling-main.51
%U https://doi.org/10.18653/v1/2020.coling-main.51
%P 592-603
Markdown (Informal)
[Arabizi Language Models for Sentiment Analysis](https://aclanthology.org/2020.coling-main.51) (Baert et al., COLING 2020)
ACL
- Gaétan Baert, Souhir Gahbiche, Guillaume Gadek, and Alexandre Pauchet. 2020. Arabizi Language Models for Sentiment Analysis. In Proceedings of the 28th International Conference on Computational Linguistics, pages 592–603, Barcelona, Spain (Online). International Committee on Computational Linguistics.