@inproceedings{guo-sarker-2023-socbert,
title = "{S}oc{BERT}: A Pretrained Model for Social Media Text",
author = "Guo, Yuting and
Sarker, Abeed",
editor = "Tafreshi, Shabnam and
Akula, Arjun and
Sedoc, Jo{\~a}o and
Drozd, Aleksandr and
Rogers, Anna and
Rumshisky, Anna",
booktitle = "Proceedings of the Fourth Workshop on Insights from Negative Results in NLP",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.insights-1.5",
doi = "10.18653/v1/2023.insights-1.5",
pages = "45--52",
abstract = "Pretrained language models (PLMs) on domain-specific data have been proven to be effective for in-domain natural language processing (NLP) tasks. Our work aimed to develop a language model which can be effective for the NLP tasks with the data from diverse social media platforms. We pretrained a language model on Twitter and Reddit posts in English consisting of 929M sequence blocks for 112K steps. We benchmarked our model and 3 transformer-based models{---}BERT, BERTweet, and RoBERTa on 40 social media text classification tasks. The results showed that although our model did not perform the best on all of the tasks, it outperformed the baseline model{---}BERT on most of the tasks, which illustrates the effectiveness of our model. Also, our work provides some insights of how to improve the efficiency of training PLMs.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="guo-sarker-2023-socbert">
<titleInfo>
<title>SocBERT: A Pretrained Model for Social Media Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuting</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abeed</namePart>
<namePart type="family">Sarker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Workshop on Insights from Negative Results in NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shabnam</namePart>
<namePart type="family">Tafreshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arjun</namePart>
<namePart type="family">Akula</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Sedoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aleksandr</namePart>
<namePart type="family">Drozd</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rumshisky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dubrovnik, Croatia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Pretrained language models (PLMs) on domain-specific data have been proven to be effective for in-domain natural language processing (NLP) tasks. Our work aimed to develop a language model which can be effective for the NLP tasks with the data from diverse social media platforms. We pretrained a language model on Twitter and Reddit posts in English consisting of 929M sequence blocks for 112K steps. We benchmarked our model and 3 transformer-based models—BERT, BERTweet, and RoBERTa on 40 social media text classification tasks. The results showed that although our model did not perform the best on all of the tasks, it outperformed the baseline model—BERT on most of the tasks, which illustrates the effectiveness of our model. Also, our work provides some insights of how to improve the efficiency of training PLMs.</abstract>
<identifier type="citekey">guo-sarker-2023-socbert</identifier>
<identifier type="doi">10.18653/v1/2023.insights-1.5</identifier>
<location>
<url>https://aclanthology.org/2023.insights-1.5</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>45</start>
<end>52</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SocBERT: A Pretrained Model for Social Media Text
%A Guo, Yuting
%A Sarker, Abeed
%Y Tafreshi, Shabnam
%Y Akula, Arjun
%Y Sedoc, João
%Y Drozd, Aleksandr
%Y Rogers, Anna
%Y Rumshisky, Anna
%S Proceedings of the Fourth Workshop on Insights from Negative Results in NLP
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F guo-sarker-2023-socbert
%X Pretrained language models (PLMs) on domain-specific data have been proven to be effective for in-domain natural language processing (NLP) tasks. Our work aimed to develop a language model which can be effective for the NLP tasks with the data from diverse social media platforms. We pretrained a language model on Twitter and Reddit posts in English consisting of 929M sequence blocks for 112K steps. We benchmarked our model and 3 transformer-based models—BERT, BERTweet, and RoBERTa on 40 social media text classification tasks. The results showed that although our model did not perform the best on all of the tasks, it outperformed the baseline model—BERT on most of the tasks, which illustrates the effectiveness of our model. Also, our work provides some insights of how to improve the efficiency of training PLMs.
%R 10.18653/v1/2023.insights-1.5
%U https://aclanthology.org/2023.insights-1.5
%U https://doi.org/10.18653/v1/2023.insights-1.5
%P 45-52
Markdown (Informal)
[SocBERT: A Pretrained Model for Social Media Text](https://aclanthology.org/2023.insights-1.5) (Guo & Sarker, insights-WS 2023)
ACL
Yuting Guo and Abeed Sarker. 2023. SocBERT: A Pretrained Model for Social Media Text. In Proceedings of the Fourth Workshop on Insights from Negative Results in NLP, pages 45–52, Dubrovnik, Croatia. Association for Computational Linguistics.
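For readers who want to experiment with the model described above, the following is a minimal sketch of loading a SocBERT-style checkpoint with the Hugging Face transformers library for a social media text classification task, in the spirit of the benchmarking summarized in the abstract. The checkpoint ID "sarkerlab/SocBERT-base", the two-label setup, and the toy example texts are illustrative assumptions, not details confirmed by the citation records above; substitute the authors' actual released checkpoint name.

```python
# Minimal sketch: loading a SocBERT-style checkpoint for social media text
# classification. The model ID below is an assumption for illustration;
# check the authors' release for the actual checkpoint name.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_ID = "sarkerlab/SocBERT-base"  # assumed Hugging Face model ID

# Load the tokenizer and the encoder with a fresh binary classification head.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, num_labels=2)

# Toy posts standing in for a social media classification dataset.
texts = [
    "just got my flu shot, arm is sore but feeling fine",
    "this new phone battery dies in two hours, total waste of money",
]
batch = tokenizer(
    texts, padding=True, truncation=True, max_length=128, return_tensors="pt"
)

# Forward pass; with an untrained head these scores are meaningless until the
# model is fine-tuned on labeled task data.
with torch.no_grad():
    logits = model(**batch).logits
print(logits.softmax(dim=-1))
```

The classification head is randomly initialized, so fine-tuning on labeled task data (for example with transformers' Trainer API) is required before the predictions reflect any real task.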