@inproceedings{meade-etal-2023-using,
title = "Using In-Context Learning to Improve Dialogue Safety",
author = "Meade, Nicholas and
Gella, Spandana and
Hazarika, Devamanyu and
Gupta, Prakhar and
Jin, Di and
Reddy, Siva and
Liu, Yang and
Hakkani-Tur, Dilek",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.796/",
doi = "10.18653/v1/2023.findings-emnlp.796",
pages = "11882--11910",
abstract = "While large neural-based conversational models have become increasingly proficient dialogue agents, recent work has highlighted safety issues with these systems. For example, these systems can be goaded into generating toxic content, often perpetuating social biases or stereotypes. We investigate a retrieval-based approach for reducing bias and toxicity in responses from chatbots. It uses in-context learning to steer a model towards safer generations. Concretely, to generate a response to an unsafe dialogue context, we retrieve demonstrations of safe responses to similar dialogue contexts. We find our method performs competitively with existing approaches to dialogue safety without requiring training. We also show, using automatic and human evaluation, that reductions in toxicity obtained using our approach are not at the cost engagingness or coherency. Finally, we note our method can be used in compliment to existing dialogue safety approaches, such as RLHF."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="meade-etal-2023-using">
<titleInfo>
<title>Using In-Context Learning to Improve Dialogue Safety</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicholas</namePart>
<namePart type="family">Meade</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Spandana</namePart>
<namePart type="family">Gella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Devamanyu</namePart>
<namePart type="family">Hazarika</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prakhar</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Di</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siva</namePart>
<namePart type="family">Reddy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dilek</namePart>
<namePart type="family">Hakkani-Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>While large neural-based conversational models have become increasingly proficient dialogue agents, recent work has highlighted safety issues with these systems. For example, these systems can be goaded into generating toxic content, often perpetuating social biases or stereotypes. We investigate a retrieval-based approach for reducing bias and toxicity in responses from chatbots. It uses in-context learning to steer a model towards safer generations. Concretely, to generate a response to an unsafe dialogue context, we retrieve demonstrations of safe responses to similar dialogue contexts. We find our method performs competitively with existing approaches to dialogue safety without requiring training. We also show, using automatic and human evaluation, that reductions in toxicity obtained using our approach are not at the cost engagingness or coherency. Finally, we note our method can be used in compliment to existing dialogue safety approaches, such as RLHF.</abstract>
<identifier type="citekey">meade-etal-2023-using</identifier>
<identifier type="doi">10.18653/v1/2023.findings-emnlp.796</identifier>
<location>
<url>https://aclanthology.org/2023.findings-emnlp.796/</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>11882</start>
<end>11910</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Using In-Context Learning to Improve Dialogue Safety
%A Meade, Nicholas
%A Gella, Spandana
%A Hazarika, Devamanyu
%A Gupta, Prakhar
%A Jin, Di
%A Reddy, Siva
%A Liu, Yang
%A Hakkani-Tur, Dilek
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F meade-etal-2023-using
%X While large neural-based conversational models have become increasingly proficient dialogue agents, recent work has highlighted safety issues with these systems. For example, these systems can be goaded into generating toxic content, often perpetuating social biases or stereotypes. We investigate a retrieval-based approach for reducing bias and toxicity in responses from chatbots. It uses in-context learning to steer a model towards safer generations. Concretely, to generate a response to an unsafe dialogue context, we retrieve demonstrations of safe responses to similar dialogue contexts. We find our method performs competitively with existing approaches to dialogue safety without requiring training. We also show, using automatic and human evaluation, that reductions in toxicity obtained using our approach are not at the cost engagingness or coherency. Finally, we note our method can be used in compliment to existing dialogue safety approaches, such as RLHF.
%R 10.18653/v1/2023.findings-emnlp.796
%U https://aclanthology.org/2023.findings-emnlp.796/
%U https://doi.org/10.18653/v1/2023.findings-emnlp.796
%P 11882-11910
Markdown (Informal)
[Using In-Context Learning to Improve Dialogue Safety](https://aclanthology.org/2023.findings-emnlp.796/) (Meade et al., Findings 2023)
ACL
- Nicholas Meade, Spandana Gella, Devamanyu Hazarika, Prakhar Gupta, Di Jin, Siva Reddy, Yang Liu, and Dilek Hakkani-Tur. 2023. Using In-Context Learning to Improve Dialogue Safety. In Findings of the Association for Computational Linguistics: EMNLP 2023, pages 11882–11910, Singapore. Association for Computational Linguistics.