@inproceedings{chung-bright-2024-effectiveness,
title = "On the Effectiveness of Adversarial Robustness for Abuse Mitigation with Counterspeech",
author = "Chung, Yi-Ling and
Bright, Jonathan",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-long.386/",
doi = "10.18653/v1/2024.naacl-long.386",
pages = "6988--7002",
abstract = "Recent work on automated approaches to counterspeech have mostly focused on synthetic data but seldom look into how the public deals with abuse. While these systems identifying and generating counterspeech have the potential for abuse mitigation, it remains unclear how robust a model is against adversarial attacks across multiple domains and how models trained on synthetic data can handle unseen user-generated abusive content in the real world. To tackle these issues, this paper first explores the dynamics of abuse and replies using our novel dataset of 6,955 labelled tweets targeted at footballers for studying public figure abuse. We then curate DynaCounter, a new English dataset of 1,911 pairs of abuse and replies addressing nine minority identity groups, collected in an adversarial human-in-the-loop process over four rounds. Our analysis shows that adversarial attacks do not necessarily result in better generalisation. We further present a study of multi-domain counterspeech generation, comparing Flan-T5 and T5 models. We observe that handling certain abuse targets is particularly challenging."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chung-bright-2024-effectiveness">
<titleInfo>
<title>On the Effectiveness of Adversarial Robustness for Abuse Mitigation with Counterspeech</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yi-Ling</namePart>
<namePart type="family">Chung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonathan</namePart>
<namePart type="family">Bright</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent work on automated approaches to counterspeech have mostly focused on synthetic data but seldom look into how the public deals with abuse. While these systems identifying and generating counterspeech have the potential for abuse mitigation, it remains unclear how robust a model is against adversarial attacks across multiple domains and how models trained on synthetic data can handle unseen user-generated abusive content in the real world. To tackle these issues, this paper first explores the dynamics of abuse and replies using our novel dataset of 6,955 labelled tweets targeted at footballers for studying public figure abuse. We then curate DynaCounter, a new English dataset of 1,911 pairs of abuse and replies addressing nine minority identity groups, collected in an adversarial human-in-the-loop process over four rounds. Our analysis shows that adversarial attacks do not necessarily result in better generalisation. We further present a study of multi-domain counterspeech generation, comparing Flan-T5 and T5 models. We observe that handling certain abuse targets is particularly challenging.</abstract>
<identifier type="citekey">chung-bright-2024-effectiveness</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-long.386</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-long.386/</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>6988</start>
<end>7002</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On the Effectiveness of Adversarial Robustness for Abuse Mitigation with Counterspeech
%A Chung, Yi-Ling
%A Bright, Jonathan
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F chung-bright-2024-effectiveness
%X Recent work on automated approaches to counterspeech have mostly focused on synthetic data but seldom look into how the public deals with abuse. While these systems identifying and generating counterspeech have the potential for abuse mitigation, it remains unclear how robust a model is against adversarial attacks across multiple domains and how models trained on synthetic data can handle unseen user-generated abusive content in the real world. To tackle these issues, this paper first explores the dynamics of abuse and replies using our novel dataset of 6,955 labelled tweets targeted at footballers for studying public figure abuse. We then curate DynaCounter, a new English dataset of 1,911 pairs of abuse and replies addressing nine minority identity groups, collected in an adversarial human-in-the-loop process over four rounds. Our analysis shows that adversarial attacks do not necessarily result in better generalisation. We further present a study of multi-domain counterspeech generation, comparing Flan-T5 and T5 models. We observe that handling certain abuse targets is particularly challenging.
%R 10.18653/v1/2024.naacl-long.386
%U https://aclanthology.org/2024.naacl-long.386/
%U https://doi.org/10.18653/v1/2024.naacl-long.386
%P 6988-7002
Markdown (Informal)
[On the Effectiveness of Adversarial Robustness for Abuse Mitigation with Counterspeech](https://aclanthology.org/2024.naacl-long.386/) (Chung & Bright, NAACL 2024)
ACL