@inproceedings{akkasi-etal-2023-reference,
title = "Reference-Free Summarization Evaluation with Large Language Models",
author = "Akkasi, Abbas and
Fraser, Kathleen and
Komeili, Majid",
editor = {Deutsch, Daniel and
Dror, Rotem and
Eger, Steffen and
Gao, Yang and
Leiter, Christoph and
Opitz, Juri and
R{\"u}ckl{\'e}, Andreas},
booktitle = "Proceedings of the 4th Workshop on Evaluation and Comparison of NLP Systems",
month = nov,
year = "2023",
address = "Bali, Indonesia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.eval4nlp-1.16/",
doi = "10.18653/v1/2023.eval4nlp-1.16",
pages = "193--201",
abstract = "With the continuous advancement in unsupervised learning methodologies, text generation has become increasingly pervasive. However, the evaluation of the quality of the generated text remains challenging. Human annotations are expensive and often show high levels of disagreement, in particular for certain tasks characterized by inherent subjectivity, such as translation and summarization.Consequently, the demand for automated metrics that can reliably assess the quality of such generative systems and their outputs has grown more pronounced than ever. In 2023, Eval4NLP organized a shared task dedicated to the automatic evaluation of outputs from two specific categories of generative systems: machine translation and summarization. This evaluation was achieved through the utilization of prompts with Large Language Models. Participating in the summarization evaluation track, we propose an approach that involves prompting LLMs to evaluate six different latent dimensions of summarization quality. In contrast to many previous approaches to summarization assessments, which emphasize lexical overlap with reference text, this method surfaces the importance of correct syntax in summarization evaluation. Our method resulted in the second-highest performance in this shared task, demonstrating its effectiveness as a reference-free evaluation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="akkasi-etal-2023-reference">
<titleInfo>
<title>Reference-Free Summarization Evaluation with Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Abbas</namePart>
<namePart type="family">Akkasi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kathleen</namePart>
<namePart type="family">Fraser</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Majid</namePart>
<namePart type="family">Komeili</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Evaluation and Comparison of NLP Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Deutsch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rotem</namePart>
<namePart type="family">Dror</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steffen</namePart>
<namePart type="family">Eger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christoph</namePart>
<namePart type="family">Leiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juri</namePart>
<namePart type="family">Opitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Rücklé</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bali, Indonesia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>With the continuous advancement in unsupervised learning methodologies, text generation has become increasingly pervasive. However, the evaluation of the quality of the generated text remains challenging. Human annotations are expensive and often show high levels of disagreement, in particular for certain tasks characterized by inherent subjectivity, such as translation and summarization. Consequently, the demand for automated metrics that can reliably assess the quality of such generative systems and their outputs has grown more pronounced than ever. In 2023, Eval4NLP organized a shared task dedicated to the automatic evaluation of outputs from two specific categories of generative systems: machine translation and summarization. This evaluation was achieved through the utilization of prompts with Large Language Models. Participating in the summarization evaluation track, we propose an approach that involves prompting LLMs to evaluate six different latent dimensions of summarization quality. In contrast to many previous approaches to summarization assessments, which emphasize lexical overlap with reference text, this method surfaces the importance of correct syntax in summarization evaluation. Our method resulted in the second-highest performance in this shared task, demonstrating its effectiveness as a reference-free evaluation.</abstract>
<identifier type="citekey">akkasi-etal-2023-reference</identifier>
<identifier type="doi">10.18653/v1/2023.eval4nlp-1.16</identifier>
<location>
<url>https://aclanthology.org/2023.eval4nlp-1.16/</url>
</location>
<part>
<date>2023-11</date>
<extent unit="page">
<start>193</start>
<end>201</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Reference-Free Summarization Evaluation with Large Language Models
%A Akkasi, Abbas
%A Fraser, Kathleen
%A Komeili, Majid
%Y Deutsch, Daniel
%Y Dror, Rotem
%Y Eger, Steffen
%Y Gao, Yang
%Y Leiter, Christoph
%Y Opitz, Juri
%Y Rücklé, Andreas
%S Proceedings of the 4th Workshop on Evaluation and Comparison of NLP Systems
%D 2023
%8 November
%I Association for Computational Linguistics
%C Bali, Indonesia
%F akkasi-etal-2023-reference
%X With the continuous advancement in unsupervised learning methodologies, text generation has become increasingly pervasive. However, the evaluation of the quality of the generated text remains challenging. Human annotations are expensive and often show high levels of disagreement, in particular for certain tasks characterized by inherent subjectivity, such as translation and summarization. Consequently, the demand for automated metrics that can reliably assess the quality of such generative systems and their outputs has grown more pronounced than ever. In 2023, Eval4NLP organized a shared task dedicated to the automatic evaluation of outputs from two specific categories of generative systems: machine translation and summarization. This evaluation was achieved through the utilization of prompts with Large Language Models. Participating in the summarization evaluation track, we propose an approach that involves prompting LLMs to evaluate six different latent dimensions of summarization quality. In contrast to many previous approaches to summarization assessments, which emphasize lexical overlap with reference text, this method surfaces the importance of correct syntax in summarization evaluation. Our method resulted in the second-highest performance in this shared task, demonstrating its effectiveness as a reference-free evaluation.
%R 10.18653/v1/2023.eval4nlp-1.16
%U https://aclanthology.org/2023.eval4nlp-1.16/
%U https://doi.org/10.18653/v1/2023.eval4nlp-1.16
%P 193-201
Markdown (Informal)
[Reference-Free Summarization Evaluation with Large Language Models](https://aclanthology.org/2023.eval4nlp-1.16/) (Akkasi et al., Eval4NLP 2023)
ACL
Abbas Akkasi, Kathleen Fraser, and Majid Komeili. 2023. Reference-Free Summarization Evaluation with Large Language Models. In Proceedings of the 4th Workshop on Evaluation and Comparison of NLP Systems, pages 193–201, Bali, Indonesia. Association for Computational Linguistics.