@inproceedings{pais-etal-2022-challenges,
title = "Challenges in Creating a Representative Corpus of {R}omanian Micro-Blogging Text",
author = "Pais, Vasile and
Mitrofan, Maria and
Barbu Mititelu, Verginica and
Irimia, Elena and
Micu, Roxana and
Gasan, Carol Luca",
editor = {Banski, Piotr and
Barbaresi, Adrien and
Clematide, Simon and
Kupietz, Marc and
L{\"u}ngen, Harald},
booktitle = "Proceedings of the Workshop on Challenges in the Management of Large Corpora (CMLC-10)",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.cmlc-1.1",
pages = "1--7",
abstract = "Following the successful creation of a national representative corpus of contemporary Romanian language, we turned our attention to the social media text, as present in micro-blogging platforms. In this paper, we present the current activities as well as the challenges faced when trying to apply existing tools (for both annotation and indexing) to a Romanian language micro-blogging corpus. These challenges are encountered at all annotation levels, including tokenization, and at the indexing stage. We consider that existing tools for Romanian language processing must be adapted to recognize features such as emoticons, emojis, hashtags, unusual abbreviations, elongated words (commonly used for emphasis in micro-blogging), multiple words joined together (within or outside hashtags), and code-mixed text.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pais-etal-2022-challenges">
<titleInfo>
<title>Challenges in Creating a Representative Corpus of Romanian Micro-Blogging Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vasile</namePart>
<namePart type="family">Pais</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Mitrofan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Verginica</namePart>
<namePart type="family">Barbu Mititelu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Irimia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roxana</namePart>
<namePart type="family">Micu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carol</namePart>
<namePart type="given">Luca</namePart>
<namePart type="family">Gasan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Challenges in the Management of Large Corpora (CMLC-10)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Piotr</namePart>
<namePart type="family">Banski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adrien</namePart>
<namePart type="family">Barbaresi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Clematide</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marc</namePart>
<namePart type="family">Kupietz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harald</namePart>
<namePart type="family">Lüngen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Following the successful creation of a national representative corpus of contemporary Romanian language, we turned our attention to the social media text, as present in micro-blogging platforms. In this paper, we present the current activities as well as the challenges faced when trying to apply existing tools (for both annotation and indexing) to a Romanian language micro-blogging corpus. These challenges are encountered at all annotation levels, including tokenization, and at the indexing stage. We consider that existing tools for Romanian language processing must be adapted to recognize features such as emoticons, emojis, hashtags, unusual abbreviations, elongated words (commonly used for emphasis in micro-blogging), multiple words joined together (within or outside hashtags), and code-mixed text.</abstract>
<identifier type="citekey">pais-etal-2022-challenges</identifier>
<location>
<url>https://aclanthology.org/2022.cmlc-1.1</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>1</start>
<end>7</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Challenges in Creating a Representative Corpus of Romanian Micro-Blogging Text
%A Pais, Vasile
%A Mitrofan, Maria
%A Barbu Mititelu, Verginica
%A Irimia, Elena
%A Micu, Roxana
%A Gasan, Carol Luca
%Y Banski, Piotr
%Y Barbaresi, Adrien
%Y Clematide, Simon
%Y Kupietz, Marc
%Y Lüngen, Harald
%S Proceedings of the Workshop on Challenges in the Management of Large Corpora (CMLC-10)
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F pais-etal-2022-challenges
%X Following the successful creation of a national representative corpus of contemporary Romanian language, we turned our attention to the social media text, as present in micro-blogging platforms. In this paper, we present the current activities as well as the challenges faced when trying to apply existing tools (for both annotation and indexing) to a Romanian language micro-blogging corpus. These challenges are encountered at all annotation levels, including tokenization, and at the indexing stage. We consider that existing tools for Romanian language processing must be adapted to recognize features such as emoticons, emojis, hashtags, unusual abbreviations, elongated words (commonly used for emphasis in micro-blogging), multiple words joined together (within or outside hashtags), and code-mixed text.
%U https://aclanthology.org/2022.cmlc-1.1
%P 1-7
Markdown (Informal)
[Challenges in Creating a Representative Corpus of Romanian Micro-Blogging Text](https://aclanthology.org/2022.cmlc-1.1) (Pais et al., CMLC 2022)
ACL