@inproceedings{nguyen-son-etal-2022-checkhard,
title = "{C}heck{HARD}: Checking Hard Labels for Adversarial Text Detection, Prediction Correction, and Perturbed Word Suggestion",
author = "Nguyen-Son, Hoang-Quoc and
Ung, Huy Quang and
Hidano, Seira and
Fukushima, Kazuhide and
Kiyomoto, Shinsaku",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-emnlp.210/",
doi = "10.18653/v1/2022.findings-emnlp.210",
pages = "2903--2913",
abstract = "An adversarial attack generates harmful text that fools a target model. More dangerously, this text is unrecognizable by humans. Existing work detects adversarial text and corrects a target`s prediction by identifying perturbed words and changing them into their synonyms, but many benign words are also changed. In this paper, we directly detect adversarial text, correct the prediction, and suggest perturbed words by checking the change in the hard labels from the target`s predictions after replacing a word with its transformation using a model that we call CheckHARD. The experiments demonstrate that CheckHARD outperforms existing work on various attacks, models, and datasets."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nguyen-son-etal-2022-checkhard">
<titleInfo>
<title>CheckHARD: Checking Hard Labels for Adversarial Text Detection, Prediction Correction, and Perturbed Word Suggestion</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hoang-Quoc</namePart>
<namePart type="family">Nguyen-Son</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huy</namePart>
<namePart type="given">Quang</namePart>
<namePart type="family">Ung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seira</namePart>
<namePart type="family">Hidano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kazuhide</namePart>
<namePart type="family">Fukushima</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shinsaku</namePart>
<namePart type="family">Kiyomoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>An adversarial attack generates harmful text that fools a target model. More dangerously, this text is unrecognizable by humans. Existing work detects adversarial text and corrects a target's prediction by identifying perturbed words and changing them into their synonyms, but many benign words are also changed. In this paper, we directly detect adversarial text, correct the prediction, and suggest perturbed words by checking the change in the hard labels from the target's predictions after replacing a word with its transformation using a model that we call CheckHARD. The experiments demonstrate that CheckHARD outperforms existing work on various attacks, models, and datasets.</abstract>
<identifier type="citekey">nguyen-son-etal-2022-checkhard</identifier>
<identifier type="doi">10.18653/v1/2022.findings-emnlp.210</identifier>
<location>
<url>https://aclanthology.org/2022.findings-emnlp.210/</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>2903</start>
<end>2913</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CheckHARD: Checking Hard Labels for Adversarial Text Detection, Prediction Correction, and Perturbed Word Suggestion
%A Nguyen-Son, Hoang-Quoc
%A Ung, Huy Quang
%A Hidano, Seira
%A Fukushima, Kazuhide
%A Kiyomoto, Shinsaku
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F nguyen-son-etal-2022-checkhard
%X An adversarial attack generates harmful text that fools a target model. More dangerously, this text is unrecognizable by humans. Existing work detects adversarial text and corrects a target's prediction by identifying perturbed words and changing them into their synonyms, but many benign words are also changed. In this paper, we directly detect adversarial text, correct the prediction, and suggest perturbed words by checking the change in the hard labels from the target's predictions after replacing a word with its transformation using a model that we call CheckHARD. The experiments demonstrate that CheckHARD outperforms existing work on various attacks, models, and datasets.
%R 10.18653/v1/2022.findings-emnlp.210
%U https://aclanthology.org/2022.findings-emnlp.210/
%U https://doi.org/10.18653/v1/2022.findings-emnlp.210
%P 2903-2913
Markdown (Informal)
[CheckHARD: Checking Hard Labels for Adversarial Text Detection, Prediction Correction, and Perturbed Word Suggestion](https://aclanthology.org/2022.findings-emnlp.210/) (Nguyen-Son et al., Findings 2022)
ACL
Hoang-Quoc Nguyen-Son, Huy Quang Ung, Seira Hidano, Kazuhide Fukushima, and Shinsaku Kiyomoto. 2022. [CheckHARD: Checking Hard Labels for Adversarial Text Detection, Prediction Correction, and Perturbed Word Suggestion](https://aclanthology.org/2022.findings-emnlp.210/). In *Findings of the Association for Computational Linguistics: EMNLP 2022*, pages 2903–2913, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.