@inproceedings{guo-etal-2020-benchmarking,
    title = "Benchmarking of Transformer-Based Pre-Trained Models on Social Media Text Classification Datasets",
    author = "Guo, Yuting and
      Dong, Xiangjue and
      Al-Garadi, Mohammed Ali and
      Sarker, Abeed and
      Paris, Cecile and
      Aliod, Diego Moll{\'a}",
    editor = "Kim, Maria and
      Beck, Daniel and
      Mistica, Meladel",
    booktitle = "Proceedings of the 18th Annual Workshop of the Australasian Language Technology Association",
    month = dec,
    year = "2020",
    address = "Virtual Workshop",
    publisher = "Australasian Language Technology Association",
    url = "https://aclanthology.org/2020.alta-1.10/",
    pages = "86--91",
    abstract = "Free text data from social media is now widely used in natural language processing research, and one of the most common machine learning tasks performed on this data is classification. Generally speaking, performances of supervised classification algorithms on social media datasets are lower than those on texts from other sources, but recently-proposed transformer-based models have considerably improved upon legacy state-of-the-art systems. Currently, there is no study that compares the performances of different variants of transformer-based models on a wide range of social media text classification datasets. In this paper, we benchmark the performances of transformer-based pre-trained models on 25 social media text classification datasets, 6 of which are health-related. We compare three pre-trained language models, RoBERTa-base, BERTweet and ClinicalBioBERT in terms of classification accuracy. Our experiments show that RoBERTa-base and BERTweet perform comparably on most datasets, and considerably better than ClinicalBioBERT, even on health-related datasets."
}
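For convenience, a minimal sketch of how the BibTeX entry above might be used from LaTeX; the file name references.bib and the natbib/plainnat choices are illustrative assumptions, not part of the Anthology record.

% Minimal usage sketch (assumptions: the entry above is saved as references.bib;
% natbib with plainnat stands in for whatever bibliography style the venue requires).
\documentclass{article}
\usepackage{natbib}
\begin{document}
Transformer-based pre-trained models have been benchmarked on social media
text classification \citep{guo-etal-2020-benchmarking}.
\bibliographystyle{plainnat}
\bibliography{references}
\end{document}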
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="guo-etal-2020-benchmarking">
<titleInfo>
<title>Benchmarking of Transformer-Based Pre-Trained Models on Social Media Text Classification Datasets</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuting</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiangjue</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammed</namePart>
<namePart type="given">Ali</namePart>
<namePart type="family">Al-Garadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abeed</namePart>
<namePart type="family">Sarker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cecile</namePart>
<namePart type="family">Paris</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diego</namePart>
<namePart type="given">Mollá</namePart>
<namePart type="family">Aliod</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 18th Annual Workshop of the Australasian Language Technology Association</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Beck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Meladel</namePart>
<namePart type="family">Mistica</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Australasian Language Technology Association</publisher>
<place>
<placeTerm type="text">Virtual Workshop</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Free text data from social media is now widely used in natural language processing research, and one of the most common machine learning tasks performed on this data is classification. Generally speaking, performances of supervised classification algorithms on social media datasets are lower than those on texts from other sources, but recently-proposed transformer-based models have considerably improved upon legacy state-of-the-art systems. Currently, there is no study that compares the performances of different variants of transformer-based models on a wide range of social media text classification datasets. In this paper, we benchmark the performances of transformer-based pre-trained models on 25 social media text classification datasets, 6 of which are health-related. We compare three pre-trained language models, RoBERTa-base, BERTweet and ClinicalBioBERT in terms of classification accuracy. Our experiments show that RoBERTa-base and BERTweet perform comparably on most datasets, and considerably better than ClinicalBioBERT, even on health-related datasets.</abstract>
<identifier type="citekey">guo-etal-2020-benchmarking</identifier>
<location>
<url>https://aclanthology.org/2020.alta-1.10/</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>86</start>
<end>91</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Benchmarking of Transformer-Based Pre-Trained Models on Social Media Text Classification Datasets
%A Guo, Yuting
%A Dong, Xiangjue
%A Al-Garadi, Mohammed Ali
%A Sarker, Abeed
%A Paris, Cecile
%A Aliod, Diego Mollá
%Y Kim, Maria
%Y Beck, Daniel
%Y Mistica, Meladel
%S Proceedings of the 18th Annual Workshop of the Australasian Language Technology Association
%D 2020
%8 December
%I Australasian Language Technology Association
%C Virtual Workshop
%F guo-etal-2020-benchmarking
%X Free text data from social media is now widely used in natural language processing research, and one of the most common machine learning tasks performed on this data is classification. Generally speaking, performances of supervised classification algorithms on social media datasets are lower than those on texts from other sources, but recently-proposed transformer-based models have considerably improved upon legacy state-of-the-art systems. Currently, there is no study that compares the performances of different variants of transformer-based models on a wide range of social media text classification datasets. In this paper, we benchmark the performances of transformer-based pre-trained models on 25 social media text classification datasets, 6 of which are health-related. We compare three pre-trained language models, RoBERTa-base, BERTweet and ClinicalBioBERT in terms of classification accuracy. Our experiments show that RoBERTa-base and BERTweet perform comparably on most datasets, and considerably better than ClinicalBioBERT, even on health-related datasets.
%U https://aclanthology.org/2020.alta-1.10/
%P 86-91
Markdown (Informal)
[Benchmarking of Transformer-Based Pre-Trained Models on Social Media Text Classification Datasets](https://aclanthology.org/2020.alta-1.10/) (Guo et al., ALTA 2020)