BibTeX
@inproceedings{szmyd-etal-2023-trelbert,
    title = "{T}rel{BERT}: A pre-trained encoder for {P}olish {T}witter",
    author = "Szmyd, Wojciech and
      Kotyla, Alicja and
      Zobni{\'o}w, Micha{\l} and
      Falkiewicz, Piotr and
      Bartczuk, Jakub and
      Zygad{\l}o, Artur",
    editor = "Piskorski, Jakub and
      Marci{\'n}czuk, Micha{\l} and
      Nakov, Preslav and
      Ogrodniczuk, Maciej and
      Pollak, Senja and
      P{\v{r}}ib{\'a}{\v{n}}, Pavel and
      Rybak, Piotr and
      Steinberger, Josef and
      Yangarber, Roman",
    booktitle = "Proceedings of the 9th Workshop on Slavic Natural Language Processing 2023 (SlavicNLP 2023)",
    month = may,
    year = "2023",
    address = "Dubrovnik, Croatia",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.bsnlp-1.3/",
    doi = "10.18653/v1/2023.bsnlp-1.3",
    pages = "17--24",
    abstract = "Pre-trained Transformer-based models have become immensely popular amongst NLP practitioners. We present TrelBERT {--} the first Polish language model suited for application in the social media domain. TrelBERT is based on an existing general-domain model and adapted to the language of social media by pre-training it further on a large collection of Twitter data. We demonstrate its usefulness by evaluating it in the downstream task of cyberbullying detection, in which it achieves state-of-the-art results, outperforming larger monolingual models trained on general-domain corpora, as well as multilingual in-domain models, by a large margin. We make the model publicly available. We also release a new dataset for the problem of harmful speech detection."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="szmyd-etal-2023-trelbert">
    <titleInfo>
      <title>TrelBERT: A pre-trained encoder for Polish Twitter</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Wojciech</namePart>
      <namePart type="family">Szmyd</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alicja</namePart>
      <namePart type="family">Kotyla</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Michał</namePart>
      <namePart type="family">Zobniów</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Piotr</namePart>
      <namePart type="family">Falkiewicz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jakub</namePart>
      <namePart type="family">Bartczuk</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Artur</namePart>
      <namePart type="family">Zygadło</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 9th Workshop on Slavic Natural Language Processing 2023 (SlavicNLP 2023)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Jakub</namePart>
        <namePart type="family">Piskorski</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Michał</namePart>
        <namePart type="family">Marcińczuk</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Preslav</namePart>
        <namePart type="family">Nakov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Maciej</namePart>
        <namePart type="family">Ogrodniczuk</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Senja</namePart>
        <namePart type="family">Pollak</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Pavel</namePart>
        <namePart type="family">Přibáň</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Piotr</namePart>
        <namePart type="family">Rybak</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Josef</namePart>
        <namePart type="family">Steinberger</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Roman</namePart>
        <namePart type="family">Yangarber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dubrovnik, Croatia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Pre-trained Transformer-based models have become immensely popular amongst NLP practitioners. We present TrelBERT – the first Polish language model suited for application in the social media domain. TrelBERT is based on an existing general-domain model and adapted to the language of social media by pre-training it further on a large collection of Twitter data. We demonstrate its usefulness by evaluating it in the downstream task of cyberbullying detection, in which it achieves state-of-the-art results, outperforming larger monolingual models trained on general-domain corpora, as well as multilingual in-domain models, by a large margin. We make the model publicly available. We also release a new dataset for the problem of harmful speech detection.</abstract>
    <identifier type="citekey">szmyd-etal-2023-trelbert</identifier>
    <identifier type="doi">10.18653/v1/2023.bsnlp-1.3</identifier>
    <location>
      <url>https://aclanthology.org/2023.bsnlp-1.3/</url>
    </location>
    <part>
      <date>2023-05</date>
      <extent unit="page">
        <start>17</start>
        <end>24</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T TrelBERT: A pre-trained encoder for Polish Twitter
%A Szmyd, Wojciech
%A Kotyla, Alicja
%A Zobniów, Michał
%A Falkiewicz, Piotr
%A Bartczuk, Jakub
%A Zygadło, Artur
%Y Piskorski, Jakub
%Y Marcińczuk, Michał
%Y Nakov, Preslav
%Y Ogrodniczuk, Maciej
%Y Pollak, Senja
%Y Přibáň, Pavel
%Y Rybak, Piotr
%Y Steinberger, Josef
%Y Yangarber, Roman
%S Proceedings of the 9th Workshop on Slavic Natural Language Processing 2023 (SlavicNLP 2023)
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F szmyd-etal-2023-trelbert
%X Pre-trained Transformer-based models have become immensely popular amongst NLP practitioners. We present TrelBERT – the first Polish language model suited for application in the social media domain. TrelBERT is based on an existing general-domain model and adapted to the language of social media by pre-training it further on a large collection of Twitter data. We demonstrate its usefulness by evaluating it in the downstream task of cyberbullying detection, in which it achieves state-of-the-art results, outperforming larger monolingual models trained on general-domain corpora, as well as multilingual in-domain models, by a large margin. We make the model publicly available. We also release a new dataset for the problem of harmful speech detection.
%R 10.18653/v1/2023.bsnlp-1.3
%U https://aclanthology.org/2023.bsnlp-1.3/
%U https://doi.org/10.18653/v1/2023.bsnlp-1.3
%P 17-24
Markdown (Informal)
[TrelBERT: A pre-trained encoder for Polish Twitter](https://aclanthology.org/2023.bsnlp-1.3/) (Szmyd et al., BSNLP 2023)
ACL
Wojciech Szmyd, Alicja Kotyla, Michał Zobniów, Piotr Falkiewicz, Jakub Bartczuk, and Artur Zygadło. 2023. TrelBERT: A pre-trained encoder for Polish Twitter. In Proceedings of the 9th Workshop on Slavic Natural Language Processing 2023 (SlavicNLP 2023), pages 17–24, Dubrovnik, Croatia. Association for Computational Linguistics.
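
Usage note. The abstract states that the model is publicly available, though none of the records above give a download location. Below is a minimal, hedged sketch of querying TrelBERT as a masked-language model with the Hugging Face transformers library; the hub id "deepsense-ai/trelbert" is an assumption, not something stated in this citation page, so verify it before use.

# Minimal sketch: query TrelBERT for masked-token prediction.
# ASSUMPTION: the checkpoint is published on the Hugging Face Hub as
# "deepsense-ai/trelbert"; this id is not given in the citation records above.
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="deepsense-ai/trelbert")

# Use the tokenizer's own mask token so the sketch works whether the
# model expects [MASK] or <mask>.
mask = fill_mask.tokenizer.mask_token
for prediction in fill_mask(f"Ale {mask}!")[:3]:
    print(prediction["token_str"], prediction["score"])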