@inproceedings{shen-etal-2022-evaluation,
title = "On the Evaluation Metrics for Paraphrase Generation",
author = "Shen, Lingfeng and
Liu, Lemao and
Jiang, Haiyun and
Shi, Shuming",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.208/",
doi = "10.18653/v1/2022.emnlp-main.208",
pages = "3178--3190",
abstract = "In this paper we revisit automatic metrics for paraphrase evaluation and obtain two findings that disobey conventional wisdom: (1) Reference-free metrics achieve better performance than their reference-based counterparts. (2) Most commonly used metrics do not align well with human annotation.Underlying reasons behind the above findings are explored through additional experiments and in-depth analyses.Based on the experiments and analyses, we propose ParaScore, a new evaluation metric for paraphrase generation. It possesses the merits of reference-based and reference-free metrics and explicitly models lexical divergence. Based on our analysis and improvements, our proposed reference-based outperforms than reference-free metrics.Experimental results demonstrate that ParaScore significantly outperforms existing metrics."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shen-etal-2022-evaluation">
<titleInfo>
<title>On the Evaluation Metrics for Paraphrase Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lingfeng</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lemao</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haiyun</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuming</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we revisit automatic metrics for paraphrase evaluation and obtain two findings that contradict conventional wisdom: (1) reference-free metrics achieve better performance than their reference-based counterparts; (2) most commonly used metrics do not align well with human annotation. The underlying reasons behind these findings are explored through additional experiments and in-depth analyses. Based on the experiments and analyses, we propose ParaScore, a new evaluation metric for paraphrase generation. It possesses the merits of both reference-based and reference-free metrics and explicitly models lexical divergence. With our analysis and improvements, our proposed reference-based metric outperforms reference-free metrics. Experimental results demonstrate that ParaScore significantly outperforms existing metrics.</abstract>
<identifier type="citekey">shen-etal-2022-evaluation</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-main.208</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-main.208/</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>3178</start>
<end>3190</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On the Evaluation Metrics for Paraphrase Generation
%A Shen, Lingfeng
%A Liu, Lemao
%A Jiang, Haiyun
%A Shi, Shuming
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F shen-etal-2022-evaluation
%X In this paper, we revisit automatic metrics for paraphrase evaluation and obtain two findings that contradict conventional wisdom: (1) reference-free metrics achieve better performance than their reference-based counterparts; (2) most commonly used metrics do not align well with human annotation. The underlying reasons behind these findings are explored through additional experiments and in-depth analyses. Based on the experiments and analyses, we propose ParaScore, a new evaluation metric for paraphrase generation. It possesses the merits of both reference-based and reference-free metrics and explicitly models lexical divergence. With our analysis and improvements, our proposed reference-based metric outperforms reference-free metrics. Experimental results demonstrate that ParaScore significantly outperforms existing metrics.
%R 10.18653/v1/2022.emnlp-main.208
%U https://aclanthology.org/2022.emnlp-main.208/
%U https://doi.org/10.18653/v1/2022.emnlp-main.208
%P 3178-3190
Markdown (Informal)
[On the Evaluation Metrics for Paraphrase Generation](https://aclanthology.org/2022.emnlp-main.208/) (Shen et al., EMNLP 2022)
ACL
Lingfeng Shen, Lemao Liu, Haiyun Jiang, and Shuming Shi. 2022. On the Evaluation Metrics for Paraphrase Generation. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pages 3178–3190, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.
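
The abstract only sketches ParaScore's design: take the stronger of a reference-based and a reference-free semantic similarity, then add an explicit lexical-divergence term so that candidates which merely copy the source are penalized. Below is a minimal illustrative sketch of that shape, assuming a generic sentence-similarity callable `sim` (e.g., a BERTScore-style embedding similarity) and an assumed weight `gamma`; the function names, the normalized word-level edit distance used as the divergence proxy, and the weighting are illustrative assumptions, not the paper's exact formulation.

```python
# Illustrative sketch of the ParaScore shape described in the abstract:
# semantic similarity (max over reference-based and reference-free views)
# plus an explicit lexical-divergence bonus. All names and the weight
# `gamma` are assumptions for illustration, not the paper's formulation.

def word_edit_distance(a: list[str], b: list[str]) -> int:
    """Word-level Levenshtein distance via single-row dynamic programming."""
    dp = list(range(len(b) + 1))
    for i, wa in enumerate(a, start=1):
        prev, dp[0] = dp[0], i
        for j, wb in enumerate(b, start=1):
            cur = dp[j]
            dp[j] = min(dp[j] + 1,          # deletion
                        dp[j - 1] + 1,      # insertion
                        prev + (wa != wb))  # substitution (free if equal)
            prev = cur
    return dp[-1]

def lexical_divergence(source: str, candidate: str) -> float:
    """Normalized word-level edit distance in [0, 1]; an assumed proxy
    for the 'lexical divergence' the abstract says ParaScore models."""
    s, c = source.split(), candidate.split()
    if not s and not c:
        return 0.0
    return word_edit_distance(s, c) / max(len(s), len(c))

def parascore_sketch(source: str, candidate: str, reference: str,
                     sim, gamma: float = 0.1) -> float:
    """`sim(a, b)` is any sentence-similarity function in [0, 1];
    a BERTScore-style scorer is assumed, not prescribed by the paper."""
    semantic = max(sim(source, candidate),     # reference-free view
                   sim(reference, candidate))  # reference-based view
    return semantic + gamma * lexical_divergence(source, candidate)
```

The design choice the abstract highlights is visible in the last two lines: the `max` lets the metric fall back to reference-free comparison when references are weak, while the divergence bonus rewards candidates that stay semantically close to the source yet reword it rather than copy it.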