@inproceedings{honovich-etal-2021-q2,
title = "$Q^{2}$: {E}valuating Factual Consistency in Knowledge-Grounded Dialogues via Question Generation and Question Answering",
author = "Honovich, Or and
Choshen, Leshem and
Aharoni, Roee and
Neeman, Ella and
Szpektor, Idan and
Abend, Omri",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-main.619",
doi = "10.18653/v1/2021.emnlp-main.619",
pages = "7856--7870",
abstract = "Neural knowledge-grounded generative models for dialogue often produce content that is factually inconsistent with the knowledge they rely on, making them unreliable and limiting their applicability. Inspired by recent work on evaluating factual consistency in abstractive summarization, we propose an automatic evaluation metric for factual consistency in knowledge-grounded dialogue using automatic question generation and question answering. Our metric, denoted $Q^2$, compares answer spans using natural language inference (NLI), instead of token-based matching as done in previous work. To foster proper evaluation, we curate a novel dataset of dialogue system outputs for the Wizard-of-Wikipedia dataset, manually annotated for factual consistency. We perform a thorough meta-evaluation of $Q^2$ against other metrics using this dataset and two others, where it consistently shows higher correlation with human judgements.",
}