@inproceedings{ren-etal-2023-c,
title = "{C}-{PMI}: Conditional Pointwise Mutual Information for Turn-level Dialogue Evaluation",
author = "Ren, Liliang and
Sidhu, Mankeerat and
Zeng, Qi and
Gangi Reddy, Revanth and
Ji, Heng and
Zhai, ChengXiang",
editor = "Muresan, Smaranda and
Chen, Vivian and
Kennington, Casey and
Vandyke, David and
Dethlefs, Nina and
Inoue, Koji and
Ekstedt, Erik and
Ultes, Stefan",
booktitle = "Proceedings of the Third DialDoc Workshop on Document-grounded Dialogue and Conversational Question Answering",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.dialdoc-1.9/",
doi = "10.18653/v1/2023.dialdoc-1.9",
pages = "80--85",
abstract = "Existing reference-free turn-level evaluation metrics for chatbots inadequately capture the interaction between the user and the system. Consequently, they often correlate poorly with human evaluations. To address this issue, we propose a novel model-agnostic approach that leverages Conditional Pointwise Mutual Information (C-PMI) to measure the turn-level interaction between the system and the user based on a given evaluation dimension. Experimental results on the widely used FED dialogue evaluation dataset demonstrate that our approach significantly improves the correlation with human judgment compared with existing evaluation systems. By replacing the negative log-likelihood-based scorer with our proposed C-PMI scorer, we achieve a relative 60.5{\%} higher Spearman correlation on average for the FED evaluation metric. Our code is publicly available at \url{https://github.com/renll/C-PMI}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ren-etal-2023-c">
<titleInfo>
<title>C-PMI: Conditional Pointwise Mutual Information for Turn-level Dialogue Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Liliang</namePart>
<namePart type="family">Ren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mankeerat</namePart>
<namePart type="family">Sidhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qi</namePart>
<namePart type="family">Zeng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Revanth</namePart>
<namePart type="family">Gangi Reddy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">ChengXiang</namePart>
<namePart type="family">Zhai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third DialDoc Workshop on Document-grounded Dialogue and Conversational Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivian</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Casey</namePart>
<namePart type="family">Kennington</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Vandyke</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nina</namePart>
<namePart type="family">Dethlefs</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Koji</namePart>
<namePart type="family">Inoue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erik</namePart>
<namePart type="family">Ekstedt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stefan</namePart>
<namePart type="family">Ultes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Existing reference-free turn-level evaluation metrics for chatbots inadequately capture the interaction between the user and the system. Consequently, they often correlate poorly with human evaluations. To address this issue, we propose a novel model-agnostic approach that leverages Conditional Pointwise Mutual Information (C-PMI) to measure the turn-level interaction between the system and the user based on a given evaluation dimension. Experimental results on the widely used FED dialogue evaluation dataset demonstrate that our approach significantly improves the correlation with human judgment compared with existing evaluation systems. By replacing the negative log-likelihood-based scorer with our proposed C-PMI scorer, we achieve a relative 60.5% higher Spearman correlation on average for the FED evaluation metric. Our code is publicly available at https://github.com/renll/C-PMI.</abstract>
<identifier type="citekey">ren-etal-2023-c</identifier>
<identifier type="doi">10.18653/v1/2023.dialdoc-1.9</identifier>
<location>
<url>https://aclanthology.org/2023.dialdoc-1.9/</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>80</start>
<end>85</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T C-PMI: Conditional Pointwise Mutual Information for Turn-level Dialogue Evaluation
%A Ren, Liliang
%A Sidhu, Mankeerat
%A Zeng, Qi
%A Gangi Reddy, Revanth
%A Ji, Heng
%A Zhai, ChengXiang
%Y Muresan, Smaranda
%Y Chen, Vivian
%Y Kennington, Casey
%Y Vandyke, David
%Y Dethlefs, Nina
%Y Inoue, Koji
%Y Ekstedt, Erik
%Y Ultes, Stefan
%S Proceedings of the Third DialDoc Workshop on Document-grounded Dialogue and Conversational Question Answering
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F ren-etal-2023-c
%X Existing reference-free turn-level evaluation metrics for chatbots inadequately capture the interaction between the user and the system. Consequently, they often correlate poorly with human evaluations. To address this issue, we propose a novel model-agnostic approach that leverages Conditional Pointwise Mutual Information (C-PMI) to measure the turn-level interaction between the system and the user based on a given evaluation dimension. Experimental results on the widely used FED dialogue evaluation dataset demonstrate that our approach significantly improves the correlation with human judgment compared with existing evaluation systems. By replacing the negative log-likelihood-based scorer with our proposed C-PMI scorer, we achieve a relative 60.5% higher Spearman correlation on average for the FED evaluation metric. Our code is publicly available at https://github.com/renll/C-PMI.
%R 10.18653/v1/2023.dialdoc-1.9
%U https://aclanthology.org/2023.dialdoc-1.9/
%U https://doi.org/10.18653/v1/2023.dialdoc-1.9
%P 80-85
Markdown (Informal)
[C-PMI: Conditional Pointwise Mutual Information for Turn-level Dialogue Evaluation](https://aclanthology.org/2023.dialdoc-1.9/) (Ren et al., dialdoc 2023)
ACL