@inproceedings{yin-etal-2022-addmu,
title = "{ADDMU}: Detection of Far-Boundary Adversarial Examples with Data and Model Uncertainty Estimation",
author = "Yin, Fan and
Li, Yao and
Hsieh, Cho-Jui and
Chang, Kai-Wei",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.440",
doi = "10.18653/v1/2022.emnlp-main.440",
pages = "6567--6584",
abstract = "Adversarial Examples Detection (AED) is a crucial defense technique against adversarial attacks and has drawn increasing attention from the Natural Language Processing (NLP) community. Despite the surge of new AED methods, our studies show that existing methods heavily rely on a shortcut to achieve good performance. In other words, current search-based adversarial attacks in NLP stop once model predictions change, and thus most adversarial examples generated by those attacks are located near model decision boundaries. To surpass this shortcut and fairly evaluate AED methods, we propose to test AED methods with \textbf{F}ar \textbf{B}oundary (\textbf{FB}) adversarial examples. Existing methods show worse than random guess performance under this scenario. To overcome this limitation, we propose a new technique, \textbf{ADDMU}, \textbf{a}dversary \textbf{d}etection with \textbf{d}ata and \textbf{m}odel \textbf{u}ncertainty, which combines two types of uncertainty estimation for both regular and FB adversarial example detection. Our new method outperforms previous methods by 3.6 and 6.0 \textit{AUC} points under each scenario. Finally, our analysis shows that the two types of uncertainty provided by \textbf{ADDMU} can be leveraged to characterize adversarialexamples and identify the ones that contribute most to model{'}s robustness in adversarial training.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
    <mods ID="yin-etal-2022-addmu">
        <titleInfo>
            <title>ADDMU: Detection of Far-Boundary Adversarial Examples with Data and Model Uncertainty Estimation</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Fan</namePart>
            <namePart type="family">Yin</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">author</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Yao</namePart>
            <namePart type="family">Li</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">author</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Cho-Jui</namePart>
            <namePart type="family">Hsieh</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">author</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Kai-Wei</namePart>
            <namePart type="family">Chang</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">author</roleTerm>
            </role>
        </name>
        <originInfo>
            <dateIssued>2022-12</dateIssued>
        </originInfo>
        <typeOfResource>text</typeOfResource>
        <relatedItem type="host">
            <titleInfo>
                <title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</title>
            </titleInfo>
            <name type="personal">
                <namePart type="given">Yoav</namePart>
                <namePart type="family">Goldberg</namePart>
                <role>
                    <roleTerm authority="marcrelator" type="text">editor</roleTerm>
                </role>
            </name>
            <name type="personal">
                <namePart type="given">Zornitsa</namePart>
                <namePart type="family">Kozareva</namePart>
                <role>
                    <roleTerm authority="marcrelator" type="text">editor</roleTerm>
                </role>
            </name>
            <name type="personal">
                <namePart type="given">Yue</namePart>
                <namePart type="family">Zhang</namePart>
                <role>
                    <roleTerm authority="marcrelator" type="text">editor</roleTerm>
                </role>
            </name>
            <originInfo>
                <publisher>Association for Computational Linguistics</publisher>
                <place>
                    <placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
                </place>
            </originInfo>
            <genre authority="marcgt">conference publication</genre>
        </relatedItem>
        <abstract>Adversarial Examples Detection (AED) is a crucial defense technique against adversarial attacks and has drawn increasing attention from the Natural Language Processing (NLP) community. Despite the surge of new AED methods, our studies show that existing methods heavily rely on a shortcut to achieve good performance. In other words, current search-based adversarial attacks in NLP stop once model predictions change, and thus most adversarial examples generated by those attacks are located near model decision boundaries. To surpass this shortcut and fairly evaluate AED methods, we propose to test AED methods with Far Boundary (FB) adversarial examples. Existing methods show worse than random guess performance under this scenario. To overcome this limitation, we propose a new technique, ADDMU, adversary detection with data and model uncertainty, which combines two types of uncertainty estimation for both regular and FB adversarial example detection. Our new method outperforms previous methods by 3.6 and 6.0 AUC points under each scenario. Finally, our analysis shows that the two types of uncertainty provided by ADDMU can be leveraged to characterize adversarial examples and identify the ones that contribute most to the model’s robustness in adversarial training.</abstract>
<identifier type="citekey">yin-etal-2022-addmu</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-main.440</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-main.440</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>6567</start>
<end>6584</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ADDMU: Detection of Far-Boundary Adversarial Examples with Data and Model Uncertainty Estimation
%A Yin, Fan
%A Li, Yao
%A Hsieh, Cho-Jui
%A Chang, Kai-Wei
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F yin-etal-2022-addmu
%X Adversarial Examples Detection (AED) is a crucial defense technique against adversarial attacks and has drawn increasing attention from the Natural Language Processing (NLP) community. Despite the surge of new AED methods, our studies show that existing methods heavily rely on a shortcut to achieve good performance. In other words, current search-based adversarial attacks in NLP stop once model predictions change, and thus most adversarial examples generated by those attacks are located near model decision boundaries. To surpass this shortcut and fairly evaluate AED methods, we propose to test AED methods with Far Boundary (FB) adversarial examples. Existing methods show worse than random guess performance under this scenario. To overcome this limitation, we propose a new technique, ADDMU, adversary detection with data and model uncertainty, which combines two types of uncertainty estimation for both regular and FB adversarial example detection. Our new method outperforms previous methods by 3.6 and 6.0 AUC points under each scenario. Finally, our analysis shows that the two types of uncertainty provided by ADDMU can be leveraged to characterize adversarial examples and identify the ones that contribute most to the model’s robustness in adversarial training.
%R 10.18653/v1/2022.emnlp-main.440
%U https://aclanthology.org/2022.emnlp-main.440
%U https://doi.org/10.18653/v1/2022.emnlp-main.440
%P 6567-6584
Markdown (Informal)
[ADDMU: Detection of Far-Boundary Adversarial Examples with Data and Model Uncertainty Estimation](https://aclanthology.org/2022.emnlp-main.440) (Yin et al., EMNLP 2022)
ACL
Fan Yin, Yao Li, Cho-Jui Hsieh, and Kai-Wei Chang. 2022. ADDMU: Detection of Far-Boundary Adversarial Examples with Data and Model Uncertainty Estimation. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pages 6567–6584, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.