@inproceedings{lucas-havens-2023-gpts,
title = "{GPT}s Don{'}t Keep Secrets: Searching for Backdoor Watermark Triggers in Autoregressive Language Models",
author = "Lucas, Evan and
Havens, Timothy",
editor = "Ovalle, Anaelia and
Chang, Kai-Wei and
Mehrabi, Ninareh and
Pruksachatkun, Yada and
Galystan, Aram and
Dhamala, Jwala and
Verma, Apurv and
Cao, Trista and
Kumar, Anoop and
Gupta, Rahul",
booktitle = "Proceedings of the 3rd Workshop on Trustworthy Natural Language Processing (TrustNLP 2023)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.trustnlp-1.21",
doi = "10.18653/v1/2023.trustnlp-1.21",
pages = "242--248",
abstract = "This work analyzes backdoor watermarks in an autoregressive transformer fine-tuned to perform a generative sequence-to-sequence task, specifically summarization. We propose and demonstrate an attack to identify trigger words or phrases by analyzing open ended generations from autoregressive models that have backdoor watermarks inserted. It is shown in our work that triggers based on random common words are easier to identify than those based on single, rare tokens. The attack proposed is easy to implement and only requires access to the model weights. Code used to create the backdoor watermarked models and analyze their outputs is shared at [github link to be inserted for camera ready version].",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lucas-havens-2023-gpts">
<titleInfo>
<title>GPTs Don’t Keep Secrets: Searching for Backdoor Watermark Triggers in Autoregressive Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Evan</namePart>
<namePart type="family">Lucas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Timothy</namePart>
<namePart type="family">Havens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Workshop on Trustworthy Natural Language Processing (TrustNLP 2023)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anaelia</namePart>
<namePart type="family">Ovalle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ninareh</namePart>
<namePart type="family">Mehrabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yada</namePart>
<namePart type="family">Pruksachatkun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aram</namePart>
<namePart type="family">Galystan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jwala</namePart>
<namePart type="family">Dhamala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Apurv</namePart>
<namePart type="family">Verma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trista</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This work analyzes backdoor watermarks in an autoregressive transformer fine-tuned to perform a generative sequence-to-sequence task, specifically summarization. We propose and demonstrate an attack to identify trigger words or phrases by analyzing open ended generations from autoregressive models that have backdoor watermarks inserted. It is shown in our work that triggers based on random common words are easier to identify than those based on single, rare tokens. The attack proposed is easy to implement and only requires access to the model weights. Code used to create the backdoor watermarked models and analyze their outputs is shared at [github link to be inserted for camera ready version].</abstract>
<identifier type="citekey">lucas-havens-2023-gpts</identifier>
<identifier type="doi">10.18653/v1/2023.trustnlp-1.21</identifier>
<location>
<url>https://aclanthology.org/2023.trustnlp-1.21</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>242</start>
<end>248</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GPTs Don’t Keep Secrets: Searching for Backdoor Watermark Triggers in Autoregressive Language Models
%A Lucas, Evan
%A Havens, Timothy
%Y Ovalle, Anaelia
%Y Chang, Kai-Wei
%Y Mehrabi, Ninareh
%Y Pruksachatkun, Yada
%Y Galystan, Aram
%Y Dhamala, Jwala
%Y Verma, Apurv
%Y Cao, Trista
%Y Kumar, Anoop
%Y Gupta, Rahul
%S Proceedings of the 3rd Workshop on Trustworthy Natural Language Processing (TrustNLP 2023)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F lucas-havens-2023-gpts
%X This work analyzes backdoor watermarks in an autoregressive transformer fine-tuned to perform a generative sequence-to-sequence task, specifically summarization. We propose and demonstrate an attack to identify trigger words or phrases by analyzing open ended generations from autoregressive models that have backdoor watermarks inserted. It is shown in our work that triggers based on random common words are easier to identify than those based on single, rare tokens. The attack proposed is easy to implement and only requires access to the model weights. Code used to create the backdoor watermarked models and analyze their outputs is shared at [github link to be inserted for camera ready version].
%R 10.18653/v1/2023.trustnlp-1.21
%U https://aclanthology.org/2023.trustnlp-1.21
%U https://doi.org/10.18653/v1/2023.trustnlp-1.21
%P 242-248
Markdown (Informal)
[GPTs Don’t Keep Secrets: Searching for Backdoor Watermark Triggers in Autoregressive Language Models](https://aclanthology.org/2023.trustnlp-1.21) (Lucas & Havens, TrustNLP 2023)
ACL