@inproceedings{howcroft-rieser-2021-happens,
title = "What happens if you treat ordinal ratings as interval data? Human evaluations in {NLP} are even more under-powered than you think",
author = "Howcroft, David M. and
Rieser, Verena",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-main.703/",
doi = "10.18653/v1/2021.emnlp-main.703",
pages = "8932--8939",
abstract = "Previous work has shown that human evaluations in NLP are notoriously under-powered. Here, we argue that there are two common factors which make this problem even worse: NLP studies usually (a) treat ordinal data as interval data and (b) operate under high variance settings while the differences they are hoping to detect are often subtle. We demonstrate through simulation that ordinal mixed effects models are better able to detect small differences between models, especially in high variance settings common in evaluations of generated texts. We release tools for researchers to conduct their own power analysis and test their assumptions. We also make recommendations for improving statistical power."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="howcroft-rieser-2021-happens">
    <titleInfo>
      <title>What happens if you treat ordinal ratings as interval data? Human evaluations in NLP are even more under-powered than you think</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">David</namePart>
      <namePart type="given">M</namePart>
      <namePart type="family">Howcroft</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Verena</namePart>
      <namePart type="family">Rieser</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Marie-Francine</namePart>
        <namePart type="family">Moens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Xuanjing</namePart>
        <namePart type="family">Huang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lucia</namePart>
        <namePart type="family">Specia</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Scott</namePart>
        <namePart type="given">Wen-tau</namePart>
        <namePart type="family">Yih</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online and Punta Cana, Dominican Republic</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Previous work has shown that human evaluations in NLP are notoriously under-powered. Here, we argue that there are two common factors which make this problem even worse: NLP studies usually (a) treat ordinal data as interval data and (b) operate under high variance settings while the differences they are hoping to detect are often subtle. We demonstrate through simulation that ordinal mixed effects models are better able to detect small differences between models, especially in high variance settings common in evaluations of generated texts. We release tools for researchers to conduct their own power analysis and test their assumptions. We also make recommendations for improving statistical power.</abstract>
    <identifier type="citekey">howcroft-rieser-2021-happens</identifier>
    <identifier type="doi">10.18653/v1/2021.emnlp-main.703</identifier>
    <location>
      <url>https://aclanthology.org/2021.emnlp-main.703/</url>
    </location>
    <part>
      <date>2021-11</date>
      <extent unit="page">
        <start>8932</start>
        <end>8939</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T What happens if you treat ordinal ratings as interval data? Human evaluations in NLP are even more under-powered than you think
%A Howcroft, David M.
%A Rieser, Verena
%Y Moens, Marie-Francine
%Y Huang, Xuanjing
%Y Specia, Lucia
%Y Yih, Scott Wen-tau
%S Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online and Punta Cana, Dominican Republic
%F howcroft-rieser-2021-happens
%X Previous work has shown that human evaluations in NLP are notoriously under-powered. Here, we argue that there are two common factors which make this problem even worse: NLP studies usually (a) treat ordinal data as interval data and (b) operate under high variance settings while the differences they are hoping to detect are often subtle. We demonstrate through simulation that ordinal mixed effects models are better able to detect small differences between models, especially in high variance settings common in evaluations of generated texts. We release tools for researchers to conduct their own power analysis and test their assumptions. We also make recommendations for improving statistical power.
%R 10.18653/v1/2021.emnlp-main.703
%U https://aclanthology.org/2021.emnlp-main.703/
%U https://doi.org/10.18653/v1/2021.emnlp-main.703
%P 8932-8939
Markdown (Informal)
[What happens if you treat ordinal ratings as interval data? Human evaluations in NLP are even more under-powered than you think](https://aclanthology.org/2021.emnlp-main.703/) (Howcroft & Rieser, EMNLP 2021)
ACL
David M. Howcroft and Verena Rieser. 2021. What happens if you treat ordinal ratings as interval data? Human evaluations in NLP are even more under-powered than you think. In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pages 8932–8939, Online and Punta Cana, Dominican Republic. Association for Computational Linguistics.