@inproceedings{mohle-etal-2023-just,
title = "Just Collect, Don{'}t Filter: Noisy Labels Do Not Improve Counterspeech Collection for Languages Without Annotated Resources",
author = {M{\"o}hle, Pauline and
Orlikowski, Matthias and
Cimiano, Philipp},
editor = "Chung, Yi-Ling and
Bonaldi, Helena and
Abercrombie, Gavin and
Guerini, Marco",
booktitle = "Proceedings of the 1st Workshop on CounterSpeech for Online Abuse (CS4OA)",
month = sep,
year = "2023",
address = "Prague, Czechia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.cs4oa-1.4",
pages = "44--61",
abstract = "Counterspeech on social media is rare. Consequently, it is difficult to collect naturally occurring examples, in particular for languages without annotated datasets. In this work, we study methods to increase the relevance of social media samples for counterspeech annotation when we lack annotated resources. We use the example of sourcing German data for counterspeech annotations from Twitter. We monitor tweets from German politicians and activists to collect replies. To select relevant replies we a) find replies that match German abusive keywords or b) label replies for counterspeech using a multilingual classifier fine-tuned on English data. For both approaches and a baseline setting, we annotate a random sample and use bootstrap sampling to estimate the amount of counterspeech. We find that neither the multilingual model nor the keyword approach achieve significantly higher counts of true counterspeech than the baseline. Thus, keyword lists or multi-lingual classifiers are likely not worth the added complexity beyond purposive data collection: Already without additional filtering, we gather a meaningful sample with 7,4{\%} true counterspeech.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mohle-etal-2023-just">
<titleInfo>
<title>Just Collect, Don’t Filter: Noisy Labels Do Not Improve Counterspeech Collection for Languages Without Annotated Resources</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pauline</namePart>
<namePart type="family">Möhle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthias</namePart>
<namePart type="family">Orlikowski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Cimiano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on CounterSpeech for Online Abuse (CS4OA)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yi-Ling</namePart>
<namePart type="family">Chung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Bonaldi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gavin</namePart>
<namePart type="family">Abercrombie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Guerini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Prague, Czechia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Counterspeech on social media is rare. Consequently, it is difficult to collect naturally occurring examples, in particular for languages without annotated datasets. In this work, we study methods to increase the relevance of social media samples for counterspeech annotation when we lack annotated resources. We use the example of sourcing German data for counterspeech annotations from Twitter. We monitor tweets from German politicians and activists to collect replies. To select relevant replies we a) find replies that match German abusive keywords or b) label replies for counterspeech using a multilingual classifier fine-tuned on English data. For both approaches and a baseline setting, we annotate a random sample and use bootstrap sampling to estimate the amount of counterspeech. We find that neither the multilingual model nor the keyword approach achieve significantly higher counts of true counterspeech than the baseline. Thus, keyword lists or multi-lingual classifiers are likely not worth the added complexity beyond purposive data collection: Already without additional filtering, we gather a meaningful sample with 7,4% true counterspeech.</abstract>
<identifier type="citekey">mohle-etal-2023-just</identifier>
<location>
<url>https://aclanthology.org/2023.cs4oa-1.4</url>
</location>
<part>
<date>2023-09</date>
<extent unit="page">
<start>44</start>
<end>61</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Just Collect, Don’t Filter: Noisy Labels Do Not Improve Counterspeech Collection for Languages Without Annotated Resources
%A Möhle, Pauline
%A Orlikowski, Matthias
%A Cimiano, Philipp
%Y Chung, Yi-Ling
%Y Bonaldi, Helena
%Y Abercrombie, Gavin
%Y Guerini, Marco
%S Proceedings of the 1st Workshop on CounterSpeech for Online Abuse (CS4OA)
%D 2023
%8 September
%I Association for Computational Linguistics
%C Prague, Czechia
%F mohle-etal-2023-just
%X Counterspeech on social media is rare. Consequently, it is difficult to collect naturally occurring examples, in particular for languages without annotated datasets. In this work, we study methods to increase the relevance of social media samples for counterspeech annotation when we lack annotated resources. We use the example of sourcing German data for counterspeech annotations from Twitter. We monitor tweets from German politicians and activists to collect replies. To select relevant replies we a) find replies that match German abusive keywords or b) label replies for counterspeech using a multilingual classifier fine-tuned on English data. For both approaches and a baseline setting, we annotate a random sample and use bootstrap sampling to estimate the amount of counterspeech. We find that neither the multilingual model nor the keyword approach achieve significantly higher counts of true counterspeech than the baseline. Thus, keyword lists or multi-lingual classifiers are likely not worth the added complexity beyond purposive data collection: Already without additional filtering, we gather a meaningful sample with 7,4% true counterspeech.
%U https://aclanthology.org/2023.cs4oa-1.4
%P 44-61
Markdown (Informal)
[Just Collect, Don’t Filter: Noisy Labels Do Not Improve Counterspeech Collection for Languages Without Annotated Resources](https://aclanthology.org/2023.cs4oa-1.4) (Möhle et al., CS4OA-WS 2023)
ACL