@inproceedings{kim-etal-2024-robust,
title = "Robust Safety Classifier Against Jailbreaking Attacks: Adversarial Prompt Shield",
author = "Kim, Jinhwa and
Derakhshan, Ali and
Harris, Ian",
editor = {Chung, Yi-Ling and
Talat, Zeerak and
Nozza, Debora and
Plaza-del-Arco, Flor Miriam and
R{\"o}ttger, Paul and
Mostafazadeh Davani, Aida and
Calabrese, Agostina},
booktitle = "Proceedings of the 8th Workshop on Online Abuse and Harms (WOAH 2024)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.woah-1.12/",
doi = "10.18653/v1/2024.woah-1.12",
pages = "159--170",
abstract = "Large Language Models' safety remains a critical concern due to their vulnerability to jailbreaking attacks, which can prompt these systems to produce harmful and malicious responses. Safety classifiers, computational models trained to discern and mitigate potentially harmful, offensive, or unethical outputs, offer a practical solution to address this issue. However, despite their potential, existing safety classifiers often fail when exposed to adversarial attacks such as gradient-optimized suffix attacks. In response, our study introduces Adversarial Prompt Shield (APS), a lightweight safety classifier model that excels in detection accuracy and demonstrates resilience against unseen jailbreaking prompts. We also introduce efficiently generated adversarial training datasets, named Bot Adversarial Noisy Dialogue (BAND), which are designed to fortify the classifier`s robustness. Through extensive testing on various safety tasks and unseen jailbreaking attacks, we demonstrate the effectiveness and resilience of our models. Evaluations show that our classifier has the potential to significantly reduce the Attack Success Rate by up to 44.9{\%}. This advance paves the way for the next generation of more reliable and resilient Large Language Models."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kim-etal-2024-robust">
<titleInfo>
<title>Robust Safety Classifier Against Jailbreaking Attacks: Adversarial Prompt Shield</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jinhwa</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ali</namePart>
<namePart type="family">Derakhshan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ian</namePart>
<namePart type="family">Harris</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 8th Workshop on Online Abuse and Harms (WOAH 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yi-Ling</namePart>
<namePart type="family">Chung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeerak</namePart>
<namePart type="family">Talat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debora</namePart>
<namePart type="family">Nozza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Flor</namePart>
<namePart type="given">Miriam</namePart>
<namePart type="family">Plaza-del-Arco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Röttger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aida</namePart>
<namePart type="family">Mostafazadeh Davani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Agostina</namePart>
<namePart type="family">Calabrese</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large Language Models’ safety remains a critical concern due to their vulnerability to jailbreaking attacks, which can prompt these systems to produce harmful and malicious responses. Safety classifiers, computational models trained to discern and mitigate potentially harmful, offensive, or unethical outputs, offer a practical solution to address this issue. However, despite their potential, existing safety classifiers often fail when exposed to adversarial attacks such as gradient-optimized suffix attacks. In response, our study introduces Adversarial Prompt Shield (APS), a lightweight safety classifier model that excels in detection accuracy and demonstrates resilience against unseen jailbreaking prompts. We also introduce efficiently generated adversarial training datasets, named Bot Adversarial Noisy Dialogue (BAND), which are designed to fortify the classifier‘s robustness. Through extensive testing on various safety tasks and unseen jailbreaking attacks, we demonstrate the effectiveness and resilience of our models. Evaluations show that our classifier has the potential to significantly reduce the Attack Success Rate by up to 44.9%. This advance paves the way for the next generation of more reliable and resilient Large Language Models.</abstract>
<identifier type="citekey">kim-etal-2024-robust</identifier>
<identifier type="doi">10.18653/v1/2024.woah-1.12</identifier>
<location>
<url>https://aclanthology.org/2024.woah-1.12/</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>159</start>
<end>170</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Robust Safety Classifier Against Jailbreaking Attacks: Adversarial Prompt Shield
%A Kim, Jinhwa
%A Derakhshan, Ali
%A Harris, Ian
%Y Chung, Yi-Ling
%Y Talat, Zeerak
%Y Nozza, Debora
%Y Plaza-del-Arco, Flor Miriam
%Y Röttger, Paul
%Y Mostafazadeh Davani, Aida
%Y Calabrese, Agostina
%S Proceedings of the 8th Workshop on Online Abuse and Harms (WOAH 2024)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F kim-etal-2024-robust
%X Large Language Models’ safety remains a critical concern due to their vulnerability to jailbreaking attacks, which can prompt these systems to produce harmful and malicious responses. Safety classifiers, computational models trained to discern and mitigate potentially harmful, offensive, or unethical outputs, offer a practical solution to address this issue. However, despite their potential, existing safety classifiers often fail when exposed to adversarial attacks such as gradient-optimized suffix attacks. In response, our study introduces Adversarial Prompt Shield (APS), a lightweight safety classifier model that excels in detection accuracy and demonstrates resilience against unseen jailbreaking prompts. We also introduce efficiently generated adversarial training datasets, named Bot Adversarial Noisy Dialogue (BAND), which are designed to fortify the classifier‘s robustness. Through extensive testing on various safety tasks and unseen jailbreaking attacks, we demonstrate the effectiveness and resilience of our models. Evaluations show that our classifier has the potential to significantly reduce the Attack Success Rate by up to 44.9%. This advance paves the way for the next generation of more reliable and resilient Large Language Models.
%R 10.18653/v1/2024.woah-1.12
%U https://aclanthology.org/2024.woah-1.12/
%U https://doi.org/10.18653/v1/2024.woah-1.12
%P 159-170
Markdown (Informal)
[Robust Safety Classifier Against Jailbreaking Attacks: Adversarial Prompt Shield](https://aclanthology.org/2024.woah-1.12/) (Kim et al., WOAH 2024)
ACL