@inproceedings{prabhumoye-etal-2023-adding,
    title = "Adding Instructions during Pretraining: Effective way of Controlling Toxicity in Language Models",
    author = "Prabhumoye, Shrimai and
      Patwary, Mostofa and
      Shoeybi, Mohammad and
      Catanzaro, Bryan",
    editor = "Vlachos, Andreas and
      Augenstein, Isabelle",
    booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics",
    month = may,
    year = "2023",
    address = "Dubrovnik, Croatia",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.eacl-main.193/",
    doi = "10.18653/v1/2023.eacl-main.193",
    pages = "2636--2651",
    abstract = "Pretrained large language models have become indispensable for solving various natural language processing (NLP) tasks. However, safely deploying them in real world applications is challenging because they generate toxic content. To address this challenge, we propose two novel pretraining data augmentation strategies that significantly reduce model toxicity without compromising its utility. Our two strategies are: (1) MEDA: adds raw toxicity score as meta-data to the pretraining samples, and (2) INST: adds instructions to those samples indicating their toxicity. Our results indicate that our best performing strategy (INST) substantially reduces the toxicity probability up to 61{\%} while preserving the accuracy on five benchmark NLP tasks as well as improving AUC scores on four bias detection tasks by 1.3{\%}. We also demonstrate the generalizability of our techniques by scaling the number of training samples and the number of model parameters."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="prabhumoye-etal-2023-adding">
    <titleInfo>
      <title>Adding Instructions during Pretraining: Effective way of Controlling Toxicity in Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Shrimai</namePart>
      <namePart type="family">Prabhumoye</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mostofa</namePart>
      <namePart type="family">Patwary</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mohammad</namePart>
      <namePart type="family">Shoeybi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bryan</namePart>
      <namePart type="family">Catanzaro</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Andreas</namePart>
        <namePart type="family">Vlachos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Isabelle</namePart>
        <namePart type="family">Augenstein</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dubrovnik, Croatia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Pretrained large language models have become indispensable for solving various natural language processing (NLP) tasks. However, safely deploying them in real world applications is challenging because they generate toxic content. To address this challenge, we propose two novel pretraining data augmentation strategies that significantly reduce model toxicity without compromising its utility. Our two strategies are: (1) MEDA: adds raw toxicity score as meta-data to the pretraining samples, and (2) INST: adds instructions to those samples indicating their toxicity. Our results indicate that our best performing strategy (INST) substantially reduces the toxicity probability up to 61% while preserving the accuracy on five benchmark NLP tasks as well as improving AUC scores on four bias detection tasks by 1.3%. We also demonstrate the generalizability of our techniques by scaling the number of training samples and the number of model parameters.</abstract>
    <identifier type="citekey">prabhumoye-etal-2023-adding</identifier>
    <identifier type="doi">10.18653/v1/2023.eacl-main.193</identifier>
    <location>
      <url>https://aclanthology.org/2023.eacl-main.193/</url>
    </location>
    <part>
      <date>2023-05</date>
      <extent unit="page">
        <start>2636</start>
        <end>2651</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Adding Instructions during Pretraining: Effective way of Controlling Toxicity in Language Models
%A Prabhumoye, Shrimai
%A Patwary, Mostofa
%A Shoeybi, Mohammad
%A Catanzaro, Bryan
%Y Vlachos, Andreas
%Y Augenstein, Isabelle
%S Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F prabhumoye-etal-2023-adding
%X Pretrained large language models have become indispensable for solving various natural language processing (NLP) tasks. However, safely deploying them in real world applications is challenging because they generate toxic content. To address this challenge, we propose two novel pretraining data augmentation strategies that significantly reduce model toxicity without compromising its utility. Our two strategies are: (1) MEDA: adds raw toxicity score as meta-data to the pretraining samples, and (2) INST: adds instructions to those samples indicating their toxicity. Our results indicate that our best performing strategy (INST) substantially reduces the toxicity probability up to 61% while preserving the accuracy on five benchmark NLP tasks as well as improving AUC scores on four bias detection tasks by 1.3%. We also demonstrate the generalizability of our techniques by scaling the number of training samples and the number of model parameters.
%R 10.18653/v1/2023.eacl-main.193
%U https://aclanthology.org/2023.eacl-main.193/
%U https://doi.org/10.18653/v1/2023.eacl-main.193
%P 2636-2651
Markdown (Informal)
[Adding Instructions during Pretraining: Effective way of Controlling Toxicity in Language Models](https://aclanthology.org/2023.eacl-main.193/) (Prabhumoye et al., EACL 2023)
ACL

Shrimai Prabhumoye, Mostofa Patwary, Mohammad Shoeybi, and Bryan Catanzaro. 2023. [Adding Instructions during Pretraining: Effective way of Controlling Toxicity in Language Models](https://aclanthology.org/2023.eacl-main.193/). In *Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics*, pages 2636–2651, Dubrovnik, Croatia. Association for Computational Linguistics.
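
To make the abstract's two data-augmentation strategies concrete, here is a minimal sketch of how a pretraining sample might be tagged with toxicity information in the spirit of MEDA (raw score as meta-data) and INST (a natural-language instruction). The prefix wording, the source of the toxicity score, and the 0.5 threshold are assumptions for illustration, not the paper's actual implementation.

```python
# Illustrative sketch only: MEDA-style and INST-style augmentation of a
# pretraining sample, as described in the paper's abstract. The exact prompt
# text, score provider, and threshold are assumed, not taken from the paper.

def meda_augment(text: str, toxicity_score: float) -> str:
    """MEDA-style: prepend the raw toxicity score as meta-data."""
    return f"toxicity: {toxicity_score:.2f}\n{text}"

def inst_augment(text: str, toxicity_score: float, threshold: float = 0.5) -> str:
    """INST-style: prepend an instruction indicating whether the sample is toxic."""
    label = "toxic" if toxicity_score >= threshold else "non-toxic"
    return f"Generate {label} text:\n{text}"

if __name__ == "__main__":
    sample = "Example pretraining document ..."
    score = 0.07  # e.g. from an external toxicity classifier (assumed)
    print(meda_augment(sample, score))
    print(inst_augment(sample, score))
```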