@inproceedings{hansen-sogaard-2021-guideline,
title = "Guideline Bias in {W}izard-of-{O}z Dialogues",
author = "Hansen, Victor Petr{\'e}n Bach and
S{\o}gaard, Anders",
editor = "Church, Kenneth and
Liberman, Mark and
Kordoni, Valia",
booktitle = "Proceedings of the 1st Workshop on Benchmarking: Past, Present and Future",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.bppf-1.2/",
doi = "10.18653/v1/2021.bppf-1.2",
pages = "8--14",
    abstract = "NLP models struggle with generalization due to sampling and annotator bias. This paper focuses on a different kind of bias that has received very little attention: guideline bias, i.e., the bias introduced by how our annotator guidelines are formulated. We examine two recently introduced dialogue datasets, CCPE-M and Taskmaster-1, both collected by trained assistants in a Wizard-of-Oz set-up. For CCPE-M, we show how a simple lexical bias for the word `like' in the guidelines biases the data collection. This bias, in effect, leads to poor performance on data without this bias: a preference elicitation architecture based on BERT suffers a 5.3{\%} absolute drop in performance when `like' is replaced with a synonymous phrase, and a 13.2{\%} drop in performance when evaluated on out-of-sample data. For Taskmaster-1, we show how the order in which instructions are presented biases the data collection."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hansen-sogaard-2021-guideline">
<titleInfo>
<title>Guideline Bias in Wizard-of-Oz Dialogues</title>
</titleInfo>
<name type="personal">
<namePart type="given">Victor</namePart>
<namePart type="given">Petrén</namePart>
<namePart type="given">Bach</namePart>
<namePart type="family">Hansen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anders</namePart>
<namePart type="family">Søgaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Benchmarking: Past, Present and Future</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kenneth</namePart>
<namePart type="family">Church</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Liberman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valia</namePart>
<namePart type="family">Kordoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>NLP models struggle with generalization due to sampling and annotator bias. This paper focuses on a different kind of bias that has received very little attention: guideline bias, i.e., the bias introduced by how our annotator guidelines are formulated. We examine two recently introduced dialogue datasets, CCPE-M and Taskmaster-1, both collected by trained assistants in a Wizard-of-Oz set-up. For CCPE-M, we show how a simple lexical bias for the word "like" in the guidelines biases the data collection. This bias, in effect, leads to poor performance on data without this bias: a preference elicitation architecture based on BERT suffers a 5.3% absolute drop in performance when "like" is replaced with a synonymous phrase, and a 13.2% drop in performance when evaluated on out-of-sample data. For Taskmaster-1, we show how the order in which instructions are presented biases the data collection.</abstract>
<identifier type="citekey">hansen-sogaard-2021-guideline</identifier>
<identifier type="doi">10.18653/v1/2021.bppf-1.2</identifier>
<location>
<url>https://aclanthology.org/2021.bppf-1.2/</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>8</start>
<end>14</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Guideline Bias in Wizard-of-Oz Dialogues
%A Hansen, Victor Petrén Bach
%A Søgaard, Anders
%Y Church, Kenneth
%Y Liberman, Mark
%Y Kordoni, Valia
%S Proceedings of the 1st Workshop on Benchmarking: Past, Present and Future
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F hansen-sogaard-2021-guideline
%X NLP models struggle with generalization due to sampling and annotator bias. This paper focuses on a different kind of bias that has received very little attention: guideline bias, i.e., the bias introduced by how our annotator guidelines are formulated. We examine two recently introduced dialogue datasets, CCPE-M and Taskmaster-1, both collected by trained assistants in a Wizard-of-Oz set-up. For CCPE-M, we show how a simple lexical bias for the word "like" in the guidelines biases the data collection. This bias, in effect, leads to poor performance on data without this bias: a preference elicitation architecture based on BERT suffers a 5.3% absolute drop in performance when "like" is replaced with a synonymous phrase, and a 13.2% drop in performance when evaluated on out-of-sample data. For Taskmaster-1, we show how the order in which instructions are presented biases the data collection.
%R 10.18653/v1/2021.bppf-1.2
%U https://aclanthology.org/2021.bppf-1.2/
%U https://doi.org/10.18653/v1/2021.bppf-1.2
%P 8-14
Markdown (Informal)
[Guideline Bias in Wizard-of-Oz Dialogues](https://aclanthology.org/2021.bppf-1.2/) (Hansen & Søgaard, BPPF 2021)
ACL
Victor Petrén Bach Hansen and Anders Søgaard. 2021. Guideline Bias in Wizard-of-Oz Dialogues. In Proceedings of the 1st Workshop on Benchmarking: Past, Present and Future, pages 8–14, Online. Association for Computational Linguistics.