@inproceedings{nyberg-etal-2021-estimating,
    title = "Estimating Subjective Crowd-Evaluations as an Additional Objective to Improve Natural Language Generation",
    author = "Nyberg, Jakob and
      Paetzel, Maike and
      Manuvinakurike, Ramesh",
    editor = "Belz, Anya and
      Agarwal, Shubham and
      Graham, Yvette and
      Reiter, Ehud and
      Shimorina, Anastasia",
    booktitle = "Proceedings of the Workshop on Human Evaluation of NLP Systems (HumEval)",
    month = apr,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.humeval-1.2",
    pages = "13--24",
    abstract = "Human ratings are one of the most prevalent methods to evaluate the performance of NLP (natural language processing) algorithms. Similarly, it is common to measure the quality of sentences generated by a natural language generation model using human raters. In this paper we argue for exploring the use of subjective evaluations within the process of training language generation models in a multi-task learning setting. As a case study, we use a crowd-authored dialogue corpus to fine-tune six different language generation models. Two of these models incorporate multi-task learning and use subjective ratings of lines as part of an explicit learning goal. A human evaluation of the generated dialogue lines reveals that utterances generated by the multi-tasking models were subjectively rated as the most typical, most moving the conversation forward, and least offensive. Based on these promising first results, we discuss future research directions for incorporating subjective human evaluations into language model training and to hence keep the human user in the loop during the development process.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="nyberg-etal-2021-estimating">
    <titleInfo>
      <title>Estimating Subjective Crowd-Evaluations as an Additional Objective to Improve Natural Language Generation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Jakob</namePart>
      <namePart type="family">Nyberg</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Maike</namePart>
      <namePart type="family">Paetzel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ramesh</namePart>
      <namePart type="family">Manuvinakurike</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Workshop on Human Evaluation of NLP Systems (HumEval)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Anya</namePart>
        <namePart type="family">Belz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Shubham</namePart>
        <namePart type="family">Agarwal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yvette</namePart>
        <namePart type="family">Graham</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ehud</namePart>
        <namePart type="family">Reiter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anastasia</namePart>
        <namePart type="family">Shimorina</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Human ratings are one of the most prevalent methods to evaluate the performance of NLP (natural language processing) algorithms. Similarly, it is common to measure the quality of sentences generated by a natural language generation model using human raters. In this paper we argue for exploring the use of subjective evaluations within the process of training language generation models in a multi-task learning setting. As a case study, we use a crowd-authored dialogue corpus to fine-tune six different language generation models. Two of these models incorporate multi-task learning and use subjective ratings of lines as part of an explicit learning goal. A human evaluation of the generated dialogue lines reveals that utterances generated by the multi-tasking models were subjectively rated as the most typical, most moving the conversation forward, and least offensive. Based on these promising first results, we discuss future research directions for incorporating subjective human evaluations into language model training and to hence keep the human user in the loop during the development process.</abstract>
    <identifier type="citekey">nyberg-etal-2021-estimating</identifier>
    <location>
      <url>https://aclanthology.org/2021.humeval-1.2</url>
    </location>
    <part>
      <date>2021-04</date>
      <extent unit="page">
        <start>13</start>
        <end>24</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Estimating Subjective Crowd-Evaluations as an Additional Objective to Improve Natural Language Generation
%A Nyberg, Jakob
%A Paetzel, Maike
%A Manuvinakurike, Ramesh
%Y Belz, Anya
%Y Agarwal, Shubham
%Y Graham, Yvette
%Y Reiter, Ehud
%Y Shimorina, Anastasia
%S Proceedings of the Workshop on Human Evaluation of NLP Systems (HumEval)
%D 2021
%8 April
%I Association for Computational Linguistics
%C Online
%F nyberg-etal-2021-estimating
%X Human ratings are one of the most prevalent methods to evaluate the performance of NLP (natural language processing) algorithms. Similarly, it is common to measure the quality of sentences generated by a natural language generation model using human raters. In this paper we argue for exploring the use of subjective evaluations within the process of training language generation models in a multi-task learning setting. As a case study, we use a crowd-authored dialogue corpus to fine-tune six different language generation models. Two of these models incorporate multi-task learning and use subjective ratings of lines as part of an explicit learning goal. A human evaluation of the generated dialogue lines reveals that utterances generated by the multi-tasking models were subjectively rated as the most typical, most moving the conversation forward, and least offensive. Based on these promising first results, we discuss future research directions for incorporating subjective human evaluations into language model training and to hence keep the human user in the loop during the development process.
%U https://aclanthology.org/2021.humeval-1.2
%P 13-24
Markdown (Informal)
[Estimating Subjective Crowd-Evaluations as an Additional Objective to Improve Natural Language Generation](https://aclanthology.org/2021.humeval-1.2) (Nyberg et al., HumEval 2021)
ACL
Jakob Nyberg, Maike Paetzel, and Ramesh Manuvinakurike. 2021. [Estimating Subjective Crowd-Evaluations as an Additional Objective to Improve Natural Language Generation](https://aclanthology.org/2021.humeval-1.2). In *Proceedings of the Workshop on Human Evaluation of NLP Systems (HumEval)*, pages 13–24, Online. Association for Computational Linguistics.
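
For readers who want a concrete picture of the training setup described in the abstract, below is a minimal, hypothetical sketch of multi-task fine-tuning in which a causal language model is trained on dialogue lines while an auxiliary head regresses a subjective crowd rating of each line. The model choice (GPT-2), the mean-pooling, the single linear rating head, and the loss weight `alpha` are illustrative assumptions, not details taken from the paper.

```python
# Hypothetical sketch: multi-task fine-tuning where subjective crowd ratings
# serve as an auxiliary objective alongside the usual language-modeling loss.
# Model, pooling, head, and loss weight are assumptions for illustration only.
import torch
import torch.nn as nn
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

class RatingAuxLM(nn.Module):
    def __init__(self, model_name="gpt2", alpha=0.5):
        super().__init__()
        self.lm = GPT2LMHeadModel.from_pretrained(model_name)
        hidden = self.lm.config.n_embd
        self.rating_head = nn.Linear(hidden, 1)  # predicts a scalar rating
        self.alpha = alpha                       # weight of the auxiliary loss

    def forward(self, input_ids, attention_mask, labels, ratings):
        out = self.lm(input_ids=input_ids,
                      attention_mask=attention_mask,
                      labels=labels,
                      output_hidden_states=True)
        # Mean-pool the final hidden states over non-padding tokens and
        # regress the crowd rating from the pooled representation.
        last_hidden = out.hidden_states[-1]
        mask = attention_mask.unsqueeze(-1).float()
        pooled = (last_hidden * mask).sum(1) / mask.sum(1).clamp(min=1.0)
        pred_rating = self.rating_head(pooled).squeeze(-1)
        rating_loss = nn.functional.mse_loss(pred_rating, ratings)
        # Multi-task objective: language modeling + rating prediction.
        return out.loss + self.alpha * rating_loss

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

model = RatingAuxLM()
batch = tokenizer(["that sounds great, tell me more!"],
                  return_tensors="pt", padding=True)
ratings = torch.tensor([4.0])  # e.g. a 1-5 crowd rating for "typicality"
loss = model(batch["input_ids"], batch["attention_mask"],
             labels=batch["input_ids"], ratings=ratings)
loss.backward()
```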