@inproceedings{scire-etal-2024-fenice,
title = "{FENICE}: Factuality Evaluation of summarization based on Natural language Inference and Claim Extraction",
author = "Scir{\`e}, Alessandro and
Ghonim, Karim and
Navigli, Roberto",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.841",
doi = "10.18653/v1/2024.findings-acl.841",
pages = "14148--14161",
abstract = "Recent advancements in text summarization, particularly with the advent of Large Language Models (LLMs), have shown remarkable performance. However, a notable challenge persists as a substantial number of automatically-generated summaries exhibit factual inconsistencies, such as hallucinations. In response to this issue, various approaches for the evaluation of consistency for summarization have emerged. Yet, these newly-introduced metrics face several limitations, including lack of interpretability, focus on short document summaries (e.g., news articles), and computational impracticality, especially for LLM-based metrics. To address these shortcomings, we propose Factuality Evaluation of summarization based on Natural language Inference and Claim Extraction (FENICE), a more interpretable and efficient factuality-oriented metric. FENICE leverages an NLI-based alignment between information in the source document and a set of atomic facts, referred to as claims, extracted from the summary. Our metric sets a new state of the art on AGGREFACT, the de-facto benchmark for factuality evaluation. Moreover, we extend our evaluation to a more challenging setting by conducting a human annotation process of long-form summarization. In the hope of fostering research in summarization factuality evaluation, we release the code of our metric and our factuality annotations of long-form summarization at anonymizedurl.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="scire-etal-2024-fenice">
<titleInfo>
<title>FENICE: Factuality Evaluation of summarization based on Natural language Inference and Claim Extraction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Scirè</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karim</namePart>
<namePart type="family">Ghonim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Navigli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent advancements in text summarization, particularly with the advent of Large Language Models (LLMs), have shown remarkable performance. However, a notable challenge persists as a substantial number of automatically-generated summaries exhibit factual inconsistencies, such as hallucinations. In response to this issue, various approaches for the evaluation of consistency for summarization have emerged. Yet, these newly-introduced metrics face several limitations, including lack of interpretability, focus on short document summaries (e.g., news articles), and computational impracticality, especially for LLM-based metrics. To address these shortcomings, we propose Factuality Evaluation of summarization based on Natural language Inference and Claim Extraction (FENICE), a more interpretable and efficient factuality-oriented metric. FENICE leverages an NLI-based alignment between information in the source document and a set of atomic facts, referred to as claims, extracted from the summary. Our metric sets a new state of the art on AGGREFACT, the de-facto benchmark for factuality evaluation. Moreover, we extend our evaluation to a more challenging setting by conducting a human annotation process of long-form summarization. In the hope of fostering research in summarization factuality evaluation, we release the code of our metric and our factuality annotations of long-form summarization at anonymizedurl.</abstract>
<identifier type="citekey">scire-etal-2024-fenice</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.841</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.841</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>14148</start>
<end>14161</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T FENICE: Factuality Evaluation of summarization based on Natural language Inference and Claim Extraction
%A Scirè, Alessandro
%A Ghonim, Karim
%A Navigli, Roberto
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F scire-etal-2024-fenice
%X Recent advancements in text summarization, particularly with the advent of Large Language Models (LLMs), have shown remarkable performance. However, a notable challenge persists as a substantial number of automatically-generated summaries exhibit factual inconsistencies, such as hallucinations. In response to this issue, various approaches for the evaluation of consistency for summarization have emerged. Yet, these newly-introduced metrics face several limitations, including lack of interpretability, focus on short document summaries (e.g., news articles), and computational impracticality, especially for LLM-based metrics. To address these shortcomings, we propose Factuality Evaluation of summarization based on Natural language Inference and Claim Extraction (FENICE), a more interpretable and efficient factuality-oriented metric. FENICE leverages an NLI-based alignment between information in the source document and a set of atomic facts, referred to as claims, extracted from the summary. Our metric sets a new state of the art on AGGREFACT, the de-facto benchmark for factuality evaluation. Moreover, we extend our evaluation to a more challenging setting by conducting a human annotation process of long-form summarization. In the hope of fostering research in summarization factuality evaluation, we release the code of our metric and our factuality annotations of long-form summarization at anonymizedurl.
%R 10.18653/v1/2024.findings-acl.841
%U https://aclanthology.org/2024.findings-acl.841
%U https://doi.org/10.18653/v1/2024.findings-acl.841
%P 14148-14161
Markdown (Informal)
[FENICE: Factuality Evaluation of summarization based on Natural language Inference and Claim Extraction](https://aclanthology.org/2024.findings-acl.841) (Scirè et al., Findings 2024)
ACL