@inproceedings{basile-etal-2021-need,
title = "We Need to Consider Disagreement in Evaluation",
author = "Basile, Valerio and
Fell, Michael and
Fornaciari, Tommaso and
Hovy, Dirk and
Paun, Silviu and
Plank, Barbara and
Poesio, Massimo and
Uma, Alexandra",
editor = "Church, Kenneth and
Liberman, Mark and
Kordoni, Valia",
booktitle = "Proceedings of the 1st Workshop on Benchmarking: Past, Present and Future",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.bppf-1.3/",
doi = "10.18653/v1/2021.bppf-1.3",
pages = "15--21",
abstract = "Evaluation is of paramount importance in data-driven research fields such as Natural Language Processing (NLP) and Computer Vision (CV). Current evaluation practice largely hinges on the existence of a single {\textquotedblleft}ground truth{\textquotedblright} against which we can meaningfully compare the prediction of a model. However, this comparison is flawed for two reasons. 1) In many cases, more than one answer is correct. 2) Even where there is a single answer, disagreement among annotators is ubiquitous, making it difficult to decide on a gold standard. We argue that the current methods of adjudication, agreement, and evaluation need serious reconsideration. Some researchers now propose to minimize disagreement and to fix datasets. We argue that this is a gross oversimplification, and likely to conceal the underlying complexity. Instead, we suggest that we need to better capture the sources of disagreement to improve today`s evaluation practice. We discuss three sources of disagreement: from the annotator, the data, and the context, and show how this affects even seemingly objective tasks. Datasets with multiple annotations are becoming more common, as are methods to integrate disagreement into modeling. The logical next step is to extend this to evaluation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="basile-etal-2021-need">
<titleInfo>
<title>We Need to Consider Disagreement in Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Valerio</namePart>
<namePart type="family">Basile</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Fell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tommaso</namePart>
<namePart type="family">Fornaciari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dirk</namePart>
<namePart type="family">Hovy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Silviu</namePart>
<namePart type="family">Paun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Plank</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Massimo</namePart>
<namePart type="family">Poesio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandra</namePart>
<namePart type="family">Uma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Benchmarking: Past, Present and Future</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kenneth</namePart>
<namePart type="family">Church</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Liberman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valia</namePart>
<namePart type="family">Kordoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Evaluation is of paramount importance in data-driven research fields such as Natural Language Processing (NLP) and Computer Vision (CV). Current evaluation practice largely hinges on the existence of a single “ground truth” against which we can meaningfully compare the prediction of a model. However, this comparison is flawed for two reasons. 1) In many cases, more than one answer is correct. 2) Even where there is a single answer, disagreement among annotators is ubiquitous, making it difficult to decide on a gold standard. We argue that the current methods of adjudication, agreement, and evaluation need serious reconsideration. Some researchers now propose to minimize disagreement and to fix datasets. We argue that this is a gross oversimplification, and likely to conceal the underlying complexity. Instead, we suggest that we need to better capture the sources of disagreement to improve today‘s evaluation practice. We discuss three sources of disagreement: from the annotator, the data, and the context, and show how this affects even seemingly objective tasks. Datasets with multiple annotations are becoming more common, as are methods to integrate disagreement into modeling. The logical next step is to extend this to evaluation.</abstract>
<identifier type="citekey">basile-etal-2021-need</identifier>
<identifier type="doi">10.18653/v1/2021.bppf-1.3</identifier>
<location>
<url>https://aclanthology.org/2021.bppf-1.3/</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>15</start>
<end>21</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T We Need to Consider Disagreement in Evaluation
%A Basile, Valerio
%A Fell, Michael
%A Fornaciari, Tommaso
%A Hovy, Dirk
%A Paun, Silviu
%A Plank, Barbara
%A Poesio, Massimo
%A Uma, Alexandra
%Y Church, Kenneth
%Y Liberman, Mark
%Y Kordoni, Valia
%S Proceedings of the 1st Workshop on Benchmarking: Past, Present and Future
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F basile-etal-2021-need
%X Evaluation is of paramount importance in data-driven research fields such as Natural Language Processing (NLP) and Computer Vision (CV). Current evaluation practice largely hinges on the existence of a single “ground truth” against which we can meaningfully compare the prediction of a model. However, this comparison is flawed for two reasons. 1) In many cases, more than one answer is correct. 2) Even where there is a single answer, disagreement among annotators is ubiquitous, making it difficult to decide on a gold standard. We argue that the current methods of adjudication, agreement, and evaluation need serious reconsideration. Some researchers now propose to minimize disagreement and to fix datasets. We argue that this is a gross oversimplification, and likely to conceal the underlying complexity. Instead, we suggest that we need to better capture the sources of disagreement to improve today‘s evaluation practice. We discuss three sources of disagreement: from the annotator, the data, and the context, and show how this affects even seemingly objective tasks. Datasets with multiple annotations are becoming more common, as are methods to integrate disagreement into modeling. The logical next step is to extend this to evaluation.
%R 10.18653/v1/2021.bppf-1.3
%U https://aclanthology.org/2021.bppf-1.3/
%U https://doi.org/10.18653/v1/2021.bppf-1.3
%P 15-21
Markdown (Informal)
[We Need to Consider Disagreement in Evaluation](https://aclanthology.org/2021.bppf-1.3/) (Basile et al., BPPF 2021)
ACL
- Valerio Basile, Michael Fell, Tommaso Fornaciari, Dirk Hovy, Silviu Paun, Barbara Plank, Massimo Poesio, and Alexandra Uma. 2021. We Need to Consider Disagreement in Evaluation. In Proceedings of the 1st Workshop on Benchmarking: Past, Present and Future, pages 15–21, Online. Association for Computational Linguistics.