@article{thomson-etal-2024-common,
    title = "Common Flaws in Running Human Evaluation Experiments in {NLP}",
    author = "Thomson, Craig and
      Reiter, Ehud and
      Belz, Anya",
    journal = "Computational Linguistics",
    volume = "50",
    number = "2",
    month = jun,
    year = "2024",
    address = "Cambridge, MA",
    publisher = "MIT Press",
    url = "https://aclanthology.org/2024.cl-2.9/",
    doi = "10.1162/coli_a_00508",
    pages = "795--805",
    abstract = "While conducting a coordinated set of repeat runs of human evaluation experiments in NLP, we discovered flaws in every single experiment we selected for inclusion via a systematic process. In this squib, we describe the types of flaws we discovered, which include coding errors (e.g., loading the wrong system outputs to evaluate), failure to follow standard scientific practice (e.g., ad hoc exclusion of participants and responses), and mistakes in reported numerical results (e.g., reported numbers not matching experimental data). If these problems are widespread, it would have worrying implications for the rigor of NLP evaluation experiments as currently conducted. We discuss what researchers can do to reduce the occurrence of such flaws, including pre-registration, better code development practices, increased testing and piloting, and post-publication addressing of errors."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="thomson-etal-2024-common">
  <titleInfo>
    <title>Common Flaws in Running Human Evaluation Experiments in NLP</title>
  </titleInfo>
  <name type="personal">
    <namePart type="given">Craig</namePart>
    <namePart type="family">Thomson</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Ehud</namePart>
    <namePart type="family">Reiter</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Anya</namePart>
    <namePart type="family">Belz</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <originInfo>
    <dateIssued>2024-06</dateIssued>
  </originInfo>
  <typeOfResource>text</typeOfResource>
  <genre authority="bibutilsgt">journal article</genre>
  <relatedItem type="host">
    <titleInfo>
      <title>Computational Linguistics</title>
    </titleInfo>
    <originInfo>
      <issuance>continuing</issuance>
      <publisher>MIT Press</publisher>
      <place>
        <placeTerm type="text">Cambridge, MA</placeTerm>
      </place>
    </originInfo>
    <genre authority="marcgt">periodical</genre>
    <genre authority="bibutilsgt">academic journal</genre>
  </relatedItem>
  <abstract>While conducting a coordinated set of repeat runs of human evaluation experiments in NLP, we discovered flaws in every single experiment we selected for inclusion via a systematic process. In this squib, we describe the types of flaws we discovered, which include coding errors (e.g., loading the wrong system outputs to evaluate), failure to follow standard scientific practice (e.g., ad hoc exclusion of participants and responses), and mistakes in reported numerical results (e.g., reported numbers not matching experimental data). If these problems are widespread, it would have worrying implications for the rigor of NLP evaluation experiments as currently conducted. We discuss what researchers can do to reduce the occurrence of such flaws, including pre-registration, better code development practices, increased testing and piloting, and post-publication addressing of errors.</abstract>
  <identifier type="citekey">thomson-etal-2024-common</identifier>
  <identifier type="doi">10.1162/coli_a_00508</identifier>
  <location>
    <url>https://aclanthology.org/2024.cl-2.9/</url>
  </location>
  <part>
    <date>2024-06</date>
    <detail type="volume"><number>50</number></detail>
    <detail type="issue"><number>2</number></detail>
    <extent unit="page">
      <start>795</start>
      <end>805</end>
    </extent>
  </part>
</mods>
</modsCollection>
%0 Journal Article
%T Common Flaws in Running Human Evaluation Experiments in NLP
%A Thomson, Craig
%A Reiter, Ehud
%A Belz, Anya
%J Computational Linguistics
%D 2024
%8 June
%V 50
%N 2
%I MIT Press
%C Cambridge, MA
%F thomson-etal-2024-common
%X While conducting a coordinated set of repeat runs of human evaluation experiments in NLP, we discovered flaws in every single experiment we selected for inclusion via a systematic process. In this squib, we describe the types of flaws we discovered, which include coding errors (e.g., loading the wrong system outputs to evaluate), failure to follow standard scientific practice (e.g., ad hoc exclusion of participants and responses), and mistakes in reported numerical results (e.g., reported numbers not matching experimental data). If these problems are widespread, it would have worrying implications for the rigor of NLP evaluation experiments as currently conducted. We discuss what researchers can do to reduce the occurrence of such flaws, including pre-registration, better code development practices, increased testing and piloting, and post-publication addressing of errors.
%R 10.1162/coli_a_00508
%U https://aclanthology.org/2024.cl-2.9/
%U https://doi.org/10.1162/coli_a_00508
%P 795-805
Markdown (Informal)
[Common Flaws in Running Human Evaluation Experiments in NLP](https://aclanthology.org/2024.cl-2.9/) (Thomson et al., CL 2024)
ACL
Craig Thomson, Ehud Reiter, and Anya Belz. 2024. Common Flaws in Running Human Evaluation Experiments in NLP. Computational Linguistics, 50(2):795–805.