@inproceedings{van-miltenburg-etal-2023-reproducible,
title = "How reproducible is best-worst scaling for human evaluation? A reproduction of {`}Data-to-text Generation with Macro Planning{'}",
author = "van Miltenburg, Emiel and
Braggaar, Anouck and
Braun, Nadine and
Damen, Debby and
Goudbeek, Martijn and
van der Lee, Chris and
Tomas, Fr{\'e}d{\'e}ric and
Krahmer, Emiel",
editor = "Belz, Anya and
Popovi{\'c}, Maja and
Reiter, Ehud and
Thomson, Craig and
Sedoc, Jo{\~a}o",
booktitle = "Proceedings of the 3rd Workshop on Human Evaluation of NLP Systems",
month = sep,
year = "2023",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2023.humeval-1.7",
pages = "75--88",
abstract = "This paper is part of the larger ReproHum project, where different teams of researchers aim to reproduce published experiments from the NLP literature. Specifically, ReproHum focuses on the reproducibility of human evaluation studies, where participants indicate the quality of different outputs of Natural Language Generation (NLG) systems. This is necessary because without reproduction studies, we do not know how reliable earlier results are. This paper aims to reproduce the second human evaluation study of Puduppully {\&} Lapata (2021), while another lab is attempting to do the same. This experiment uses best-worst scaling to determine the relative performance of different NLG systems. We found that the worst performing system in the original study is now in fact the best performing system across the board. This means that we cannot fully reproduce the original results. We also carry out alternative analyses of the data, and discuss how our results may be combined with the other reproduction study that is carried out in parallel with this paper.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="van-miltenburg-etal-2023-reproducible">
<titleInfo>
<title>How reproducible is best-worst scaling for human evaluation? A reproduction of ‘Data-to-text Generation with Macro Planning’</title>
</titleInfo>
<name type="personal">
<namePart type="given">Emiel</namePart>
<namePart type="family">van Miltenburg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anouck</namePart>
<namePart type="family">Braggaar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadine</namePart>
<namePart type="family">Braun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debby</namePart>
<namePart type="family">Damen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Martijn</namePart>
<namePart type="family">Goudbeek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chris</namePart>
<namePart type="family">van der Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Tomas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emiel</namePart>
<namePart type="family">Krahmer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Workshop on Human Evaluation of NLP Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anya</namePart>
<namePart type="family">Belz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maja</namePart>
<namePart type="family">Popović</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ehud</namePart>
<namePart type="family">Reiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Craig</namePart>
<namePart type="family">Thomson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Sedoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper is part of the larger ReproHum project, where different teams of researchers aim to reproduce published experiments from the NLP literature. Specifically, ReproHum focuses on the reproducibility of human evaluation studies, where participants indicate the quality of different outputs of Natural Language Generation (NLG) systems. This is necessary because without reproduction studies, we do not know how reliable earlier results are. This paper aims to reproduce the second human evaluation study of Puduppully & Lapata (2021), while another lab is attempting to do the same. This experiment uses best-worst scaling to determine the relative performance of different NLG systems. We found that the worst performing system in the original study is now in fact the best performing system across the board. This means that we cannot fully reproduce the original results. We also carry out alternative analyses of the data, and discuss how our results may be combined with the other reproduction study that is carried out in parallel with this paper.</abstract>
<identifier type="citekey">van-miltenburg-etal-2023-reproducible</identifier>
<location>
<url>https://aclanthology.org/2023.humeval-1.7</url>
</location>
<part>
<date>2023-09</date>
<extent unit="page">
<start>75</start>
<end>88</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T How reproducible is best-worst scaling for human evaluation? A reproduction of ‘Data-to-text Generation with Macro Planning’
%A van Miltenburg, Emiel
%A Braggaar, Anouck
%A Braun, Nadine
%A Damen, Debby
%A Goudbeek, Martijn
%A van der Lee, Chris
%A Tomas, Frédéric
%A Krahmer, Emiel
%Y Belz, Anya
%Y Popović, Maja
%Y Reiter, Ehud
%Y Thomson, Craig
%Y Sedoc, João
%S Proceedings of the 3rd Workshop on Human Evaluation of NLP Systems
%D 2023
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F van-miltenburg-etal-2023-reproducible
%X This paper is part of the larger ReproHum project, where different teams of researchers aim to reproduce published experiments from the NLP literature. Specifically, ReproHum focuses on the reproducibility of human evaluation studies, where participants indicate the quality of different outputs of Natural Language Generation (NLG) systems. This is necessary because without reproduction studies, we do not know how reliable earlier results are. This paper aims to reproduce the second human evaluation study of Puduppully & Lapata (2021), while another lab is attempting to do the same. This experiment uses best-worst scaling to determine the relative performance of different NLG systems. We found that the worst performing system in the original study is now in fact the best performing system across the board. This means that we cannot fully reproduce the original results. We also carry out alternative analyses of the data, and discuss how our results may be combined with the other reproduction study that is carried out in parallel with this paper.
%U https://aclanthology.org/2023.humeval-1.7
%P 75-88
Markdown (Informal)
[How reproducible is best-worst scaling for human evaluation? A reproduction of ‘Data-to-text Generation with Macro Planning’](https://aclanthology.org/2023.humeval-1.7) (van Miltenburg et al., HumEval-WS 2023)
ACL
- Emiel van Miltenburg, Anouck Braggaar, Nadine Braun, Debby Damen, Martijn Goudbeek, Chris van der Lee, Frédéric Tomas, and Emiel Krahmer. 2023. How reproducible is best-worst scaling for human evaluation? A reproduction of ‘Data-to-text Generation with Macro Planning’. In Proceedings of the 3rd Workshop on Human Evaluation of NLP Systems, pages 75–88, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.