@inproceedings{chinea-rios-etal-2018-automatic,
title = "Are Automatic Metrics Robust and Reliable in Specific Machine Translation Tasks?",
author = "Chinea-Rios, Mara and
Peris, Alvaro and
Casacuberta, Francisco",
editor = "P{\'e}rez-Ortiz, Juan Antonio and
S{\'a}nchez-Mart{\'\i}nez, Felipe and
Espl{\`a}-Gomis, Miquel and
Popovi{\'c}, Maja and
Rico, Celia and
Martins, Andr{\'e} and
Van den Bogaert, Joachim and
Forcada, Mikel L.",
booktitle = "Proceedings of the 21st Annual Conference of the European Association for Machine Translation",
month = may,
year = "2018",
address = "Alicante, Spain",
url = "https://aclanthology.org/2018.eamt-main.9",
pages = "109--118",
abstract = "We present a comparison of automatic metrics against human evaluations of translation quality in several scenarios which were unexplored up to now. Our experimentation was conducted on translation hypotheses that were problematic for the automatic metrics, as the results greatly diverged from one metric to another. We also compared three different translation technologies. Our evaluation shows that in most cases, the metrics capture the human criteria. However, we face failures of the automatic metrics when applied to some domains and systems. Interestingly, we find that automatic metrics applied to the neural machine translation hypotheses provide the most reliable results. Finally, we provide some advice when dealing with these problematic domains.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chinea-rios-etal-2018-automatic">
<titleInfo>
<title>Are Automatic Metrics Robust and Reliable in Specific Machine Translation Tasks?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mara</namePart>
<namePart type="family">Chinea-Rios</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alvaro</namePart>
<namePart type="family">Peris</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francisco</namePart>
<namePart type="family">Casacuberta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st Annual Conference of the European Association for Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="given">Antonio</namePart>
<namePart type="family">Pérez-Ortiz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felipe</namePart>
<namePart type="family">Sánchez-Martínez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miquel</namePart>
<namePart type="family">Esplà-Gomis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maja</namePart>
<namePart type="family">Popović</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Celia</namePart>
<namePart type="family">Rico</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">André</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joachim</namePart>
<namePart type="family">Van den Bogaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikel</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Forcada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<place>
<placeTerm type="text">Alicante, Spain</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a comparison of automatic metrics against human evaluations of translation quality in several scenarios which were unexplored up to now. Our experimentation was conducted on translation hypotheses that were problematic for the automatic metrics, as the results greatly diverged from one metric to another. We also compared three different translation technologies. Our evaluation shows that in most cases, the metrics capture the human criteria. However, we face failures of the automatic metrics when applied to some domains and systems. Interestingly, we find that automatic metrics applied to the neural machine translation hypotheses provide the most reliable results. Finally, we provide some advice when dealing with these problematic domains.</abstract>
<identifier type="citekey">chinea-rios-etal-2018-automatic</identifier>
<location>
<url>https://aclanthology.org/2018.eamt-main.9</url>
</location>
<part>
<date>2018-05</date>
<extent unit="page">
<start>109</start>
<end>118</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Are Automatic Metrics Robust and Reliable in Specific Machine Translation Tasks?
%A Chinea-Rios, Mara
%A Peris, Alvaro
%A Casacuberta, Francisco
%Y Pérez-Ortiz, Juan Antonio
%Y Sánchez-Martínez, Felipe
%Y Esplà-Gomis, Miquel
%Y Popović, Maja
%Y Rico, Celia
%Y Martins, André
%Y Van den Bogaert, Joachim
%Y Forcada, Mikel L.
%S Proceedings of the 21st Annual Conference of the European Association for Machine Translation
%D 2018
%8 May
%C Alicante, Spain
%F chinea-rios-etal-2018-automatic
%X We present a comparison of automatic metrics against human evaluations of translation quality in several scenarios which were unexplored up to now. Our experimentation was conducted on translation hypotheses that were problematic for the automatic metrics, as the results greatly diverged from one metric to another. We also compared three different translation technologies. Our evaluation shows that in most cases, the metrics capture the human criteria. However, we face failures of the automatic metrics when applied to some domains and systems. Interestingly, we find that automatic metrics applied to the neural machine translation hypotheses provide the most reliable results. Finally, we provide some advice when dealing with these problematic domains.
%U https://aclanthology.org/2018.eamt-main.9
%P 109-118
Markdown (Informal)
[Are Automatic Metrics Robust and Reliable in Specific Machine Translation Tasks?](https://aclanthology.org/2018.eamt-main.9) (Chinea-Rios et al., EAMT 2018)
ACL