@inproceedings{mozes-etal-2021-frequency,
    title = "Frequency-Guided Word Substitutions for Detecting Textual Adversarial Examples",
    author = "Mozes, Maximilian and
      Stenetorp, Pontus and
      Kleinberg, Bennett and
      Griffin, Lewis",
    editor = "Merlo, Paola and
      Tiedemann, J{\"o}rg and
      Tsarfaty, Reut",
    booktitle = "Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume",
    month = apr,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.eacl-main.13",
    doi = "10.18653/v1/2021.eacl-main.13",
    pages = "171--186",
    abstract = "Recent efforts have shown that neural text processing models are vulnerable to adversarial examples, but the nature of these examples is poorly understood. In this work, we show that adversarial attacks against CNN, LSTM and Transformer-based classification models perform word substitutions that are identifiable through frequency differences between replaced words and their corresponding substitutions. Based on these findings, we propose frequency-guided word substitutions (FGWS), a simple algorithm exploiting the frequency properties of adversarial word substitutions for the detection of adversarial examples. FGWS achieves strong performance by accurately detecting adversarial examples on the SST-2 and IMDb sentiment datasets, with F1 detection scores of up to 91.4{\%} against RoBERTa-based classification models. We compare our approach against a recently proposed perturbation discrimination framework and show that we outperform it by up to 13.0{\%} F1.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mozes-etal-2021-frequency">
<titleInfo>
<title>Frequency-Guided Word Substitutions for Detecting Textual Adversarial Examples</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maximilian</namePart>
<namePart type="family">Mozes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pontus</namePart>
<namePart type="family">Stenetorp</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bennett</namePart>
<namePart type="family">Kleinberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lewis</namePart>
<namePart type="family">Griffin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume</title>
</titleInfo>
<name type="personal">
<namePart type="given">Paola</namePart>
<namePart type="family">Merlo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reut</namePart>
<namePart type="family">Tsarfaty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent efforts have shown that neural text processing models are vulnerable to adversarial examples, but the nature of these examples is poorly understood. In this work, we show that adversarial attacks against CNN, LSTM and Transformer-based classification models perform word substitutions that are identifiable through frequency differences between replaced words and their corresponding substitutions. Based on these findings, we propose frequency-guided word substitutions (FGWS), a simple algorithm exploiting the frequency properties of adversarial word substitutions for the detection of adversarial examples. FGWS achieves strong performance by accurately detecting adversarial examples on the SST-2 and IMDb sentiment datasets, with F1 detection scores of up to 91.4% against RoBERTa-based classification models. We compare our approach against a recently proposed perturbation discrimination framework and show that we outperform it by up to 13.0% F1.</abstract>
<identifier type="citekey">mozes-etal-2021-frequency</identifier>
<identifier type="doi">10.18653/v1/2021.eacl-main.13</identifier>
<location>
<url>https://aclanthology.org/2021.eacl-main.13</url>
</location>
<part>
<date>2021-04</date>
<extent unit="page">
<start>171</start>
<end>186</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Frequency-Guided Word Substitutions for Detecting Textual Adversarial Examples
%A Mozes, Maximilian
%A Stenetorp, Pontus
%A Kleinberg, Bennett
%A Griffin, Lewis
%Y Merlo, Paola
%Y Tiedemann, Jörg
%Y Tsarfaty, Reut
%S Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume
%D 2021
%8 April
%I Association for Computational Linguistics
%C Online
%F mozes-etal-2021-frequency
%X Recent efforts have shown that neural text processing models are vulnerable to adversarial examples, but the nature of these examples is poorly understood. In this work, we show that adversarial attacks against CNN, LSTM and Transformer-based classification models perform word substitutions that are identifiable through frequency differences between replaced words and their corresponding substitutions. Based on these findings, we propose frequency-guided word substitutions (FGWS), a simple algorithm exploiting the frequency properties of adversarial word substitutions for the detection of adversarial examples. FGWS achieves strong performance by accurately detecting adversarial examples on the SST-2 and IMDb sentiment datasets, with F1 detection scores of up to 91.4% against RoBERTa-based classification models. We compare our approach against a recently proposed perturbation discrimination framework and show that we outperform it by up to 13.0% F1.
%R 10.18653/v1/2021.eacl-main.13
%U https://aclanthology.org/2021.eacl-main.13
%U https://doi.org/10.18653/v1/2021.eacl-main.13
%P 171-186
Markdown (Informal)
[Frequency-Guided Word Substitutions for Detecting Textual Adversarial Examples](https://aclanthology.org/2021.eacl-main.13) (Mozes et al., EACL 2021)
ACL
Maximilian Mozes, Pontus Stenetorp, Bennett Kleinberg, and Lewis Griffin. 2021. Frequency-Guided Word Substitutions for Detecting Textual Adversarial Examples. In Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, pages 171–186, Online. Association for Computational Linguistics.
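
The abstract describes FGWS only at a high level: infrequent words are swapped for more frequent substitutes, and a sharp drop in the classifier's confidence flags the input as adversarial. The following is a minimal sketch of that idea, not the authors' implementation; every name here (`fgws_detect`, `predict_proba`, `freq`, `synonyms`, `delta`, `gamma`) is a hypothetical stand-in.

```python
def fgws_detect(words, predict_proba, freq, synonyms, delta, gamma):
    """Rough sketch of frequency-guided word substitution (FGWS) detection.

    Assumed (hypothetical) inputs:
      words         -- tokenized input text
      predict_proba -- callable returning the model's confidence in its
                       originally predicted class for a list of words
      freq          -- dict mapping word -> training-set frequency
      synonyms      -- dict mapping word -> candidate substitute words
      delta         -- frequency threshold below which a word is replaced
      gamma         -- confidence-drop threshold for flagging an input
    """
    transformed = []
    for w in words:
        if freq.get(w, 0) < delta:
            # Swap an infrequent word for its most frequent substitute,
            # if a more frequent one is available.
            cands = [s for s in synonyms.get(w, [])
                     if freq.get(s, 0) > freq.get(w, 0)]
            transformed.append(max(cands, key=lambda s: freq[s]) if cands else w)
        else:
            transformed.append(w)
    # A large confidence drop after the substitutions suggests the input
    # contained adversarial low-frequency replacements.
    return predict_proba(words) - predict_proba(transformed) > gamma
```

In this sketch, `gamma` trades precision against recall: a lower threshold flags more inputs as adversarial, while a higher one flags only inputs whose predictions are highly sensitive to the frequency-guided substitutions.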