@article{fabbri-etal-2021-summeval,
title = "{S}umm{E}val: Re-evaluating Summarization Evaluation",
author = "Fabbri, Alexander R. and
Kry{\'s}ci{\'n}ski, Wojciech and
McCann, Bryan and
Xiong, Caiming and
Socher, Richard and
Radev, Dragomir",
editor = "Roark, Brian and
Nenkova, Ani",
journal = "Transactions of the Association for Computational Linguistics",
volume = "9",
year = "2021",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2021.tacl-1.24",
doi = "10.1162/tacl_a_00373",
pages = "391--409",
abstract = "The scarcity of comprehensive up-to-date studies on evaluation metrics for text summarization and the lack of consensus regarding evaluation protocols continue to inhibit progress. We address the existing shortcomings of summarization evaluation methods along five dimensions: 1) we re-evaluate 14 automatic evaluation metrics in a comprehensive and consistent fashion using neural summarization model outputs along with expert and crowd-sourced human annotations; 2) we consistently benchmark 23 recent summarization models using the aforementioned automatic evaluation metrics; 3) we assemble the largest collection of summaries generated by models trained on the CNN/DailyMail news dataset and share it in a unified format; 4) we implement and share a toolkit that provides an extensible and unified API for evaluating summarization models across a broad range of automatic metrics; and 5) we assemble and share the largest and most diverse, in terms of model types, collection of human judgments of model-generated summaries on the CNN/Daily Mail dataset annotated by both expert judges and crowd-source workers. We hope that this work will help promote a more complete evaluation protocol for text summarization as well as advance research in developing evaluation metrics that better correlate with human judgments.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fabbri-etal-2021-summeval">
<titleInfo>
<title>SummEval: Re-evaluating Summarization Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Fabbri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wojciech</namePart>
<namePart type="family">Kryściński</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bryan</namePart>
<namePart type="family">McCann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Caiming</namePart>
<namePart type="family">Xiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Socher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dragomir</namePart>
<namePart type="family">Radev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>The scarcity of comprehensive up-to-date studies on evaluation metrics for text summarization and the lack of consensus regarding evaluation protocols continue to inhibit progress. We address the existing shortcomings of summarization evaluation methods along five dimensions: 1) we re-evaluate 14 automatic evaluation metrics in a comprehensive and consistent fashion using neural summarization model outputs along with expert and crowd-sourced human annotations; 2) we consistently benchmark 23 recent summarization models using the aforementioned automatic evaluation metrics; 3) we assemble the largest collection of summaries generated by models trained on the CNN/DailyMail news dataset and share it in a unified format; 4) we implement and share a toolkit that provides an extensible and unified API for evaluating summarization models across a broad range of automatic metrics; and 5) we assemble and share the largest and most diverse, in terms of model types, collection of human judgments of model-generated summaries on the CNN/Daily Mail dataset annotated by both expert judges and crowd-source workers. We hope that this work will help promote a more complete evaluation protocol for text summarization as well as advance research in developing evaluation metrics that better correlate with human judgments.</abstract>
<identifier type="citekey">fabbri-etal-2021-summeval</identifier>
<identifier type="doi">10.1162/tacl_a_00373</identifier>
<location>
<url>https://aclanthology.org/2021.tacl-1.24</url>
</location>
<part>
<date>2021</date>
<detail type="volume"><number>9</number></detail>
<extent unit="page">
<start>391</start>
<end>409</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T SummEval: Re-evaluating Summarization Evaluation
%A Fabbri, Alexander R.
%A Kryściński, Wojciech
%A McCann, Bryan
%A Xiong, Caiming
%A Socher, Richard
%A Radev, Dragomir
%J Transactions of the Association for Computational Linguistics
%D 2021
%V 9
%I MIT Press
%C Cambridge, MA
%F fabbri-etal-2021-summeval
%X The scarcity of comprehensive up-to-date studies on evaluation metrics for text summarization and the lack of consensus regarding evaluation protocols continue to inhibit progress. We address the existing shortcomings of summarization evaluation methods along five dimensions: 1) we re-evaluate 14 automatic evaluation metrics in a comprehensive and consistent fashion using neural summarization model outputs along with expert and crowd-sourced human annotations; 2) we consistently benchmark 23 recent summarization models using the aforementioned automatic evaluation metrics; 3) we assemble the largest collection of summaries generated by models trained on the CNN/DailyMail news dataset and share it in a unified format; 4) we implement and share a toolkit that provides an extensible and unified API for evaluating summarization models across a broad range of automatic metrics; and 5) we assemble and share the largest and most diverse, in terms of model types, collection of human judgments of model-generated summaries on the CNN/Daily Mail dataset annotated by both expert judges and crowd-source workers. We hope that this work will help promote a more complete evaluation protocol for text summarization as well as advance research in developing evaluation metrics that better correlate with human judgments.
%R 10.1162/tacl_a_00373
%U https://aclanthology.org/2021.tacl-1.24
%U https://doi.org/10.1162/tacl_a_00373
%P 391-409
Markdown (Informal)
[SummEval: Re-evaluating Summarization Evaluation](https://aclanthology.org/2021.tacl-1.24) (Fabbri et al., TACL 2021)
ACL
Alexander R. Fabbri, Wojciech Kryściński, Bryan McCann, Caiming Xiong, Richard Socher, and Dragomir Radev. 2021. SummEval: Re-evaluating Summarization Evaluation. Transactions of the Association for Computational Linguistics, 9:391–409.