BibTeX
@inproceedings{howcroft-etal-2020-twenty,
title = "Twenty Years of Confusion in Human Evaluation: {NLG} Needs Evaluation Sheets and Standardised Definitions",
author = "Howcroft, David M. and
Belz, Anya and
Clinciu, Miruna-Adriana and
Gkatzia, Dimitra and
Hasan, Sadid A. and
Mahamood, Saad and
Mille, Simon and
van Miltenburg, Emiel and
Santhanam, Sashank and
Rieser, Verena",
editor = "Davis, Brian and
Graham, Yvette and
Kelleher, John and
Sripada, Yaji",
booktitle = "Proceedings of the 13th International Conference on Natural Language Generation",
month = dec,
year = "2020",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.inlg-1.23/",
doi = "10.18653/v1/2020.inlg-1.23",
pages = "169--182",
abstract = "Human assessment remains the most trusted form of evaluation in NLG, but highly diverse approaches and a proliferation of different quality criteria used by researchers make it difficult to compare results and draw conclusions across papers, with adverse implications for meta-evaluation and reproducibility. In this paper, we present (i) our dataset of 165 NLG papers with human evaluations, (ii) the annotation scheme we developed to label the papers for different aspects of evaluations, (iii) quantitative analyses of the annotations, and (iv) a set of recommendations for improving standards in evaluation reporting. We use the annotations as a basis for examining information included in evaluation reports, and levels of consistency in approaches, experimental design and terminology, focusing in particular on the 200+ different terms that have been used for evaluated aspects of quality. We conclude that due to a pervasive lack of clarity in reports and extreme diversity in approaches, human evaluation in NLG presents as extremely confused in 2020, and that the field is in urgent need of standard methods and terminology."
}
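
For readers who want to consume the BibTeX record above programmatically, here is a minimal sketch using the third-party Python library bibtexparser (v1 API); the filename is an assumption for illustration, not part of the record.

# Minimal sketch: parse the BibTeX record above with the third-party
# bibtexparser library (v1 API); install with `pip install bibtexparser`.
# The filename is illustrative.
import bibtexparser

with open("howcroft-etal-2020-twenty.bib") as f:
    db = bibtexparser.load(f)

entry = db.entries[0]
print(entry["ID"])     # howcroft-etal-2020-twenty
print(entry["title"])  # Twenty Years of Confusion in Human Evaluation: ...
print(entry["pages"])  # 169--182
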
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="howcroft-etal-2020-twenty">
    <titleInfo>
      <title>Twenty Years of Confusion in Human Evaluation: NLG Needs Evaluation Sheets and Standardised Definitions</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">David</namePart>
      <namePart type="given">M</namePart>
      <namePart type="family">Howcroft</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anya</namePart>
      <namePart type="family">Belz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Miruna-Adriana</namePart>
      <namePart type="family">Clinciu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dimitra</namePart>
      <namePart type="family">Gkatzia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sadid</namePart>
      <namePart type="given">A</namePart>
      <namePart type="family">Hasan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Saad</namePart>
      <namePart type="family">Mahamood</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Simon</namePart>
      <namePart type="family">Mille</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Emiel</namePart>
      <namePart type="family">van Miltenburg</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sashank</namePart>
      <namePart type="family">Santhanam</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Verena</namePart>
      <namePart type="family">Rieser</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 13th International Conference on Natural Language Generation</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Brian</namePart>
        <namePart type="family">Davis</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yvette</namePart>
        <namePart type="family">Graham</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">John</namePart>
        <namePart type="family">Kelleher</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yaji</namePart>
        <namePart type="family">Sripada</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dublin, Ireland</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Human assessment remains the most trusted form of evaluation in NLG, but highly diverse approaches and a proliferation of different quality criteria used by researchers make it difficult to compare results and draw conclusions across papers, with adverse implications for meta-evaluation and reproducibility. In this paper, we present (i) our dataset of 165 NLG papers with human evaluations, (ii) the annotation scheme we developed to label the papers for different aspects of evaluations, (iii) quantitative analyses of the annotations, and (iv) a set of recommendations for improving standards in evaluation reporting. We use the annotations as a basis for examining information included in evaluation reports, and levels of consistency in approaches, experimental design and terminology, focusing in particular on the 200+ different terms that have been used for evaluated aspects of quality. We conclude that due to a pervasive lack of clarity in reports and extreme diversity in approaches, human evaluation in NLG presents as extremely confused in 2020, and that the field is in urgent need of standard methods and terminology.</abstract>
    <identifier type="citekey">howcroft-etal-2020-twenty</identifier>
    <identifier type="doi">10.18653/v1/2020.inlg-1.23</identifier>
    <location>
      <url>https://aclanthology.org/2020.inlg-1.23/</url>
    </location>
    <part>
      <date>2020-12</date>
      <extent unit="page">
        <start>169</start>
        <end>182</end>
      </extent>
    </part>
  </mods>
</modsCollection>
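
The MODS XML record above can be read with nothing beyond Python's standard library; note that every element lives in the MODS v3 namespace, which ElementTree requires you to qualify explicitly. A minimal sketch (the filename is illustrative):

# Minimal sketch: extract title and page extent from the MODS record
# above with Python's standard-library ElementTree. All elements sit in
# the MODS v3 namespace, so every query must qualify its tags.
import xml.etree.ElementTree as ET

NS = {"m": "http://www.loc.gov/mods/v3"}
root = ET.parse("howcroft-etal-2020-twenty.xml").getroot()

mods = root.find("m:mods", NS)
title = mods.find("m:titleInfo/m:title", NS).text
start = mods.find(".//m:extent/m:start", NS).text
end = mods.find(".//m:extent/m:end", NS).text
# Direct-child <name> elements are the authors; the editors sit one
# level down, inside <relatedItem>.
authors = [n.find("m:namePart[@type='family']", NS).text
           for n in mods.findall("m:name", NS)]
print(title)
print(f"pages {start}-{end}; {len(authors)} authors")
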
Endnote
%0 Conference Proceedings
%T Twenty Years of Confusion in Human Evaluation: NLG Needs Evaluation Sheets and Standardised Definitions
%A Howcroft, David M.
%A Belz, Anya
%A Clinciu, Miruna-Adriana
%A Gkatzia, Dimitra
%A Hasan, Sadid A.
%A Mahamood, Saad
%A Mille, Simon
%A van Miltenburg, Emiel
%A Santhanam, Sashank
%A Rieser, Verena
%Y Davis, Brian
%Y Graham, Yvette
%Y Kelleher, John
%Y Sripada, Yaji
%S Proceedings of the 13th International Conference on Natural Language Generation
%D 2020
%8 December
%I Association for Computational Linguistics
%C Dublin, Ireland
%F howcroft-etal-2020-twenty
%X Human assessment remains the most trusted form of evaluation in NLG, but highly diverse approaches and a proliferation of different quality criteria used by researchers make it difficult to compare results and draw conclusions across papers, with adverse implications for meta-evaluation and reproducibility. In this paper, we present (i) our dataset of 165 NLG papers with human evaluations, (ii) the annotation scheme we developed to label the papers for different aspects of evaluations, (iii) quantitative analyses of the annotations, and (iv) a set of recommendations for improving standards in evaluation reporting. We use the annotations as a basis for examining information included in evaluation reports, and levels of consistency in approaches, experimental design and terminology, focusing in particular on the 200+ different terms that have been used for evaluated aspects of quality. We conclude that due to a pervasive lack of clarity in reports and extreme diversity in approaches, human evaluation in NLG presents as extremely confused in 2020, and that the field is in urgent need of standard methods and terminology.
%R 10.18653/v1/2020.inlg-1.23
%U https://aclanthology.org/2020.inlg-1.23/
%U https://doi.org/10.18653/v1/2020.inlg-1.23
%P 169-182
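
The Endnote/Refer record above is strictly line-oriented: one %-tag per line, with repeatable tags such as %A, %Y, and %U occurring once per value. A few lines of standard Python fold it into a dict; the helper name and filename are illustrative.

# Minimal sketch: fold the %-tagged Refer/Endnote record above into a
# dict of lists. Repeatable tags (%A authors, %Y editors, %U URLs)
# accumulate one value per line.
from collections import defaultdict

def parse_refer(text):
    fields = defaultdict(list)
    for line in text.splitlines():
        if line.startswith("%") and len(line) >= 3:
            fields[line[:2]].append(line[3:].strip())
    return dict(fields)

with open("howcroft-etal-2020-twenty.txt") as f:  # illustrative filename
    record = parse_refer(f.read())

print(record["%T"][0])    # the title
print(len(record["%A"]))  # 10 authors
print(record["%P"][0])    # 169-182
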
Markdown (Informal)
[Twenty Years of Confusion in Human Evaluation: NLG Needs Evaluation Sheets and Standardised Definitions](https://aclanthology.org/2020.inlg-1.23/) (Howcroft et al., INLG 2020)

ACL
David M. Howcroft, Anya Belz, Miruna-Adriana Clinciu, Dimitra Gkatzia, Sadid A. Hasan, Saad Mahamood, Simon Mille, Emiel van Miltenburg, Sashank Santhanam, and Verena Rieser. 2020. Twenty Years of Confusion in Human Evaluation: NLG Needs Evaluation Sheets and Standardised Definitions. In Proceedings of the 13th International Conference on Natural Language Generation, pages 169–182, Dublin, Ireland. Association for Computational Linguistics.