@inproceedings{gao-etal-2023-reproduction,
title = "A Reproduction Study of the Human Evaluation of Role-Oriented Dialogue Summarization Models",
author = "Gao, Mingqi and
Ruan, Jie and
Wan, Xiaojun",
editor = "Belz, Anya and
Popovi{\'c}, Maja and
Reiter, Ehud and
Thomson, Craig and
Sedoc, Jo{\~a}o",
booktitle = "Proceedings of the 3rd Workshop on Human Evaluation of NLP Systems",
month = sep,
year = "2023",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2023.humeval-1.10",
pages = "124--129",
abstract = "This paper reports a reproduction study of the human evaluation of role-oriented dialogue summarization models, as part of the ReproNLP Shared Task 2023 on Reproducibility of Evaluations in NLP. We outline the disparities between the original study{'}s experimental design and our reproduction study, along with the outcomes obtained. The inter-annotator agreement within the reproduction study is observed to be lower, measuring 0.40 as compared to the original study{'}s 0.48. Among the six conclusions drawn in the original study, four are validated in our reproduction study. We confirm the effectiveness of the proposed approach on the overall metric, albeit with slightly poorer relative performance compared to the original study. Furthermore, we raise an open-ended inquiry: how can subjective practices in the original study be identified and addressed when conducting reproduction studies?",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gao-etal-2023-reproduction">
<titleInfo>
<title>A Reproduction Study of the Human Evaluation of Role-Oriented Dialogue Summarization Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mingqi</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jie</namePart>
<namePart type="family">Ruan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaojun</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Workshop on Human Evaluation of NLP Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anya</namePart>
<namePart type="family">Belz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maja</namePart>
<namePart type="family">Popović</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ehud</namePart>
<namePart type="family">Reiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Craig</namePart>
<namePart type="family">Thomson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Sedoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper reports a reproduction study of the human evaluation of role-oriented dialogue summarization models, as part of the ReproNLP Shared Task 2023 on Reproducibility of Evaluations in NLP. We outline the disparities between the original study’s experimental design and our reproduction study, along with the outcomes obtained. The inter-annotator agreement within the reproduction study is observed to be lower, measuring 0.40 as compared to the original study’s 0.48. Among the six conclusions drawn in the original study, four are validated in our reproduction study. We confirm the effectiveness of the proposed approach on the overall metric, albeit with slightly poorer relative performance compared to the original study. Furthermore, we raise an open-ended inquiry: how can subjective practices in the original study be identified and addressed when conducting reproduction studies?</abstract>
<identifier type="citekey">gao-etal-2023-reproduction</identifier>
<location>
<url>https://aclanthology.org/2023.humeval-1.10</url>
</location>
<part>
<date>2023-09</date>
<extent unit="page">
<start>124</start>
<end>129</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Reproduction Study of the Human Evaluation of Role-Oriented Dialogue Summarization Models
%A Gao, Mingqi
%A Ruan, Jie
%A Wan, Xiaojun
%Y Belz, Anya
%Y Popović, Maja
%Y Reiter, Ehud
%Y Thomson, Craig
%Y Sedoc, João
%S Proceedings of the 3rd Workshop on Human Evaluation of NLP Systems
%D 2023
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F gao-etal-2023-reproduction
%X This paper reports a reproduction study of the human evaluation of role-oriented dialogue summarization models, as part of the ReproNLP Shared Task 2023 on Reproducibility of Evaluations in NLP. We outline the disparities between the original study’s experimental design and our reproduction study, along with the outcomes obtained. The inter-annotator agreement within the reproduction study is observed to be lower, measuring 0.40 as compared to the original study’s 0.48. Among the six conclusions drawn in the original study, four are validated in our reproduction study. We confirm the effectiveness of the proposed approach on the overall metric, albeit with slightly poorer relative performance compared to the original study. Furthermore, we raise an open-ended inquiry: how can subjective practices in the original study be identified and addressed when conducting reproduction studies?
%U https://aclanthology.org/2023.humeval-1.10
%P 124-129
Markdown (Informal)
[A Reproduction Study of the Human Evaluation of Role-Oriented Dialogue Summarization Models](https://aclanthology.org/2023.humeval-1.10) (Gao et al., HumEval-WS 2023)
ACL