@inproceedings{niu-etal-2024-parameter,
title = "Parameter-Efficient Detoxification with Contrastive Decoding",
author = "Niu, Tong and
Xiong, Caiming and
Zhou, Yingbo and
Yavuz, Semih",
editor = "Soni, Nikita and
Flek, Lucie and
Sharma, Ashish and
Yang, Diyi and
Hooker, Sara and
Schwartz, H. Andrew",
booktitle = "Proceedings of the 1st Human-Centered Large Language Modeling Workshop",
month = aug,
year = "2024",
address = "TBD",
publisher = "ACL",
url = "https://aclanthology.org/2024.hucllm-1.3/",
doi = "10.18653/v1/2024.hucllm-1.3",
pages = "30--40",
abstract = "The field of natural language generation has witnessed significant advancements in recent years, including the development of controllable text generation techniques. However, controlling the attributes of the generated text remains a challenge, especially when aiming to avoid undesirable behavior such as toxicity. In this work, we introduce Detoxification Generator (DETOXIGEN), an inference-time algorithm that steers the generation away from unwanted styles. DETOXIGEN is an ensemble of a pre-trained language model (generator) and a detoxifier. The detoxifier is trained intentionally on the toxic data representative of the undesirable attribute, encouraging it to generate text in that style exclusively. During the actual generation, we use the trained detoxifier to produce undesirable tokens for the generator to contrast against at each decoding step. This approach directly informs the generator to avoid generating tokens that the detoxifier considers highly likely. We evaluate DETOXIGEN on the commonly used REALTOXICITYPROMPTS benchmark (Gehman et al., 2020) with various language models as generators. We find that it significantly outperforms previous approaches in detoxification metrics while not compromising on the generation quality. Moreover, the detoxifier is obtained by soft prompt-tuning using the same backbone language model as the generator. Hence, DETOXIGEN requires only a tiny amount of extra weights from the virtual tokens of the detoxifier to be loaded into GPU memory while decoding, making it a promising lightweight, practical, and parameter-efficient detoxification strategy."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="niu-etal-2024-parameter">
<titleInfo>
<title>Parameter-Efficient Detoxification with Contrastive Decoding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tong</namePart>
<namePart type="family">Niu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Caiming</namePart>
<namePart type="family">Xiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yingbo</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Semih</namePart>
<namePart type="family">Yavuz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Human-Centered Large Language Modeling Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nikita</namePart>
<namePart type="family">Soni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucie</namePart>
<namePart type="family">Flek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashish</namePart>
<namePart type="family">Sharma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diyi</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Hooker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">H</namePart>
<namePart type="given">Andrew</namePart>
<namePart type="family">Schwartz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ACL</publisher>
<place>
<placeTerm type="text">TBD</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The field of natural language generation has witnessed significant advancements in recent years, including the development of controllable text generation techniques. However, controlling the attributes of the generated text remains a challenge, especially when aiming to avoid undesirable behavior such as toxicity. In this work, we introduce Detoxification Generator (DETOXIGEN), an inference-time algorithm that steers the generation away from unwanted styles. DETOXIGEN is an ensemble of a pre-trained language model (generator) and a detoxifier. The detoxifier is trained intentionally on the toxic data representative of the undesirable attribute, encouraging it to generate text in that style exclusively. During the actual generation, we use the trained detoxifier to produce undesirable tokens for the generator to contrast against at each decoding step. This approach directly informs the generator to avoid generating tokens that the detoxifier considers highly likely. We evaluate DETOXIGEN on the commonly used REALTOXICITYPROMPTS benchmark (Gehman et al., 2020) with various language models as generators. We find that it significantly outperforms previous approaches in detoxification metrics while not compromising on the generation quality. Moreover, the detoxifier is obtained by soft prompt-tuning using the same backbone language model as the generator. Hence, DETOXIGEN requires only a tiny amount of extra weights from the virtual tokens of the detoxifier to be loaded into GPU memory while decoding, making it a promising lightweight, practical, and parameter-efficient detoxification strategy.</abstract>
<identifier type="citekey">niu-etal-2024-parameter</identifier>
<identifier type="doi">10.18653/v1/2024.hucllm-1.3</identifier>
<location>
<url>https://aclanthology.org/2024.hucllm-1.3/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>30</start>
<end>40</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Parameter-Efficient Detoxification with Contrastive Decoding
%A Niu, Tong
%A Xiong, Caiming
%A Zhou, Yingbo
%A Yavuz, Semih
%Y Soni, Nikita
%Y Flek, Lucie
%Y Sharma, Ashish
%Y Yang, Diyi
%Y Hooker, Sara
%Y Schwartz, H. Andrew
%S Proceedings of the 1st Human-Centered Large Language Modeling Workshop
%D 2024
%8 August
%I ACL
%C TBD
%F niu-etal-2024-parameter
%X The field of natural language generation has witnessed significant advancements in recent years, including the development of controllable text generation techniques. However, controlling the attributes of the generated text remains a challenge, especially when aiming to avoid undesirable behavior such as toxicity. In this work, we introduce Detoxification Generator (DETOXIGEN), an inference-time algorithm that steers the generation away from unwanted styles. DETOXIGEN is an ensemble of a pre-trained language model (generator) and a detoxifier. The detoxifier is trained intentionally on the toxic data representative of the undesirable attribute, encouraging it to generate text in that style exclusively. During the actual generation, we use the trained detoxifier to produce undesirable tokens for the generator to contrast against at each decoding step. This approach directly informs the generator to avoid generating tokens that the detoxifier considers highly likely. We evaluate DETOXIGEN on the commonly used REALTOXICITYPROMPTS benchmark (Gehman et al., 2020) with various language models as generators. We find that it significantly outperforms previous approaches in detoxification metrics while not compromising on the generation quality. Moreover, the detoxifier is obtained by soft prompt-tuning using the same backbone language model as the generator. Hence, DETOXIGEN requires only a tiny amount of extra weights from the virtual tokens of the detoxifier to be loaded into GPU memory while decoding, making it a promising lightweight, practical, and parameter-efficient detoxification strategy.
%R 10.18653/v1/2024.hucllm-1.3
%U https://aclanthology.org/2024.hucllm-1.3/
%U https://doi.org/10.18653/v1/2024.hucllm-1.3
%P 30-40
Markdown (Informal)
[Parameter-Efficient Detoxification with Contrastive Decoding](https://aclanthology.org/2024.hucllm-1.3/) (Niu et al., HuCLLM 2024)
ACL