@inproceedings{blagec-etal-2022-global,
title = "A global analysis of metrics used for measuring performance in natural language processing",
author = "Blagec, Kathrin and
Dorffner, Georg and
Moradi, Milad and
Ott, Simon and
Samwald, Matthias",
editor = "Shavrina, Tatiana and
Mikhailov, Vladislav and
Malykh, Valentin and
Artemova, Ekaterina and
Serikov, Oleg and
Protasov, Vitaly",
booktitle = "Proceedings of NLP Power! The First Workshop on Efficient Benchmarking in NLP",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.nlppower-1.6",
doi = "10.18653/v1/2022.nlppower-1.6",
pages = "52--63",
abstract = "Measuring the performance of natural language processing models is challenging. Traditionally used metrics, such as BLEU and ROUGE, originally devised for machine translation and summarization, have been shown to suffer from low correlation with human judgment and a lack of transferability to other tasks and languages. In the past 15 years, a wide range of alternative metrics have been proposed. However, it is unclear to what extent this has had an impact on NLP benchmarking efforts. Here we provide the first large-scale cross-sectional analysis of metrics used for measuring performance in natural language processing. We curated, mapped and systematized more than 3500 machine learning model performance results from the open repository {`}Papers with Code{'} to enable a global and comprehensive analysis. Our results suggest that the large majority of natural language processing metrics currently used have properties that may result in an inadequate reflection of a models{'} performance. Furthermore, we found that ambiguities and inconsistencies in the reporting of metrics may lead to difficulties in interpreting and comparing model performances, impairing transparency and reproducibility in NLP research.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="blagec-etal-2022-global">
<titleInfo>
<title>A global analysis of metrics used for measuring performance in natural language processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kathrin</namePart>
<namePart type="family">Blagec</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Georg</namePart>
<namePart type="family">Dorffner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Milad</namePart>
<namePart type="family">Moradi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Ott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthias</namePart>
<namePart type="family">Samwald</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of NLP Power! The First Workshop on Efficient Benchmarking in NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tatiana</namePart>
<namePart type="family">Shavrina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vladislav</namePart>
<namePart type="family">Mikhailov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valentin</namePart>
<namePart type="family">Malykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Artemova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oleg</namePart>
<namePart type="family">Serikov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vitaly</namePart>
<namePart type="family">Protasov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Measuring the performance of natural language processing models is challenging. Traditionally used metrics, such as BLEU and ROUGE, originally devised for machine translation and summarization, have been shown to suffer from low correlation with human judgment and a lack of transferability to other tasks and languages. In the past 15 years, a wide range of alternative metrics have been proposed. However, it is unclear to what extent this has had an impact on NLP benchmarking efforts. Here we provide the first large-scale cross-sectional analysis of metrics used for measuring performance in natural language processing. We curated, mapped and systematized more than 3500 machine learning model performance results from the open repository ‘Papers with Code’ to enable a global and comprehensive analysis. Our results suggest that the large majority of natural language processing metrics currently used have properties that may result in an inadequate reflection of a models’ performance. Furthermore, we found that ambiguities and inconsistencies in the reporting of metrics may lead to difficulties in interpreting and comparing model performances, impairing transparency and reproducibility in NLP research.</abstract>
<identifier type="citekey">blagec-etal-2022-global</identifier>
<identifier type="doi">10.18653/v1/2022.nlppower-1.6</identifier>
<location>
<url>https://aclanthology.org/2022.nlppower-1.6</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>52</start>
<end>63</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A global analysis of metrics used for measuring performance in natural language processing
%A Blagec, Kathrin
%A Dorffner, Georg
%A Moradi, Milad
%A Ott, Simon
%A Samwald, Matthias
%Y Shavrina, Tatiana
%Y Mikhailov, Vladislav
%Y Malykh, Valentin
%Y Artemova, Ekaterina
%Y Serikov, Oleg
%Y Protasov, Vitaly
%S Proceedings of NLP Power! The First Workshop on Efficient Benchmarking in NLP
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F blagec-etal-2022-global
%X Measuring the performance of natural language processing models is challenging. Traditionally used metrics, such as BLEU and ROUGE, originally devised for machine translation and summarization, have been shown to suffer from low correlation with human judgment and a lack of transferability to other tasks and languages. In the past 15 years, a wide range of alternative metrics have been proposed. However, it is unclear to what extent this has had an impact on NLP benchmarking efforts. Here we provide the first large-scale cross-sectional analysis of metrics used for measuring performance in natural language processing. We curated, mapped and systematized more than 3500 machine learning model performance results from the open repository ‘Papers with Code’ to enable a global and comprehensive analysis. Our results suggest that the large majority of natural language processing metrics currently used have properties that may result in an inadequate reflection of a models’ performance. Furthermore, we found that ambiguities and inconsistencies in the reporting of metrics may lead to difficulties in interpreting and comparing model performances, impairing transparency and reproducibility in NLP research.
%R 10.18653/v1/2022.nlppower-1.6
%U https://aclanthology.org/2022.nlppower-1.6
%U https://doi.org/10.18653/v1/2022.nlppower-1.6
%P 52-63
Markdown (Informal)
[A global analysis of metrics used for measuring performance in natural language processing](https://aclanthology.org/2022.nlppower-1.6) (Blagec et al., nlppower 2022)
ACL