@inproceedings{han-etal-2021-cushlepor,
title = "cush{LEPOR}: customising h{LEPOR} metric using Optuna for higher agreement with human judgments or pre-trained language model {L}a{BSE}",
author = "Han, Lifeng and
Sorokina, Irina and
Erofeev, Gleb and
Gladkoff, Serge",
editor = "Barrault, Loic and
Bojar, Ondrej and
Bougares, Fethi and
Chatterjee, Rajen and
Costa-jussa, Marta R. and
Federmann, Christian and
Fishel, Mark and
Fraser, Alexander and
Freitag, Markus and
Graham, Yvette and
Grundkiewicz, Roman and
Guzman, Paco and
Haddow, Barry and
Huck, Matthias and
Yepes, Antonio Jimeno and
Koehn, Philipp and
Kocmi, Tom and
Martins, Andre and
Morishita, Makoto and
Monz, Christof",
booktitle = "Proceedings of the Sixth Conference on Machine Translation",
month = nov,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.wmt-1.109",
pages = "1014--1023",
abstract = "Human evaluation has always been expensive while researchers struggle to trust the automatic metrics. To address this, we propose to customise traditional metrics by taking advantages of the pre-trained language models (PLMs) and the limited available human labelled scores. We first re-introduce the hLEPOR metric factors, followed by the Python version we developed (ported) which achieved the automatic tuning of the weighting parameters in hLEPOR metric. Then we present the customised hLEPOR (cushLEPOR) which uses Optuna hyper-parameter optimisation framework to fine-tune hLEPOR weighting parameters towards better agreement to pre-trained language models (using LaBSE) regarding the exact MT language pairs that cushLEPOR is deployed to. We also optimise cushLEPOR towards professional human evaluation data based on MQM and pSQM framework on English-German and Chinese-English language pairs. The experimental investigations show cushLEPOR boosts hLEPOR performances towards better agreements to PLMs like LABSE with much lower cost, and better agreements to human evaluations including MQM and pSQM scores, and yields much better performances than BLEU. Official results show that our submissions win three language pairs including English-German and Chinese-English on News domain via cushLEPOR(LM) and English-Russian on TED domain via hLEPOR. (data available at \url{https://github.com/poethan/cushLEPOR})",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="han-etal-2021-cushlepor">
<titleInfo>
<title>cushLEPOR: customising hLEPOR metric using Optuna for higher agreement with human judgments or pre-trained language model LaBSE</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lifeng</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Irina</namePart>
<namePart type="family">Sorokina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gleb</namePart>
<namePart type="family">Erofeev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Serge</namePart>
<namePart type="family">Gladkoff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth Conference on Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Loic</namePart>
<namePart type="family">Barrault</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondrej</namePart>
<namePart type="family">Bojar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fethi</namePart>
<namePart type="family">Bougares</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rajen</namePart>
<namePart type="family">Chatterjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Costa-jussa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Federmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Fishel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Fraser</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Markus</namePart>
<namePart type="family">Freitag</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yvette</namePart>
<namePart type="family">Graham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roman</namePart>
<namePart type="family">Grundkiewicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paco</namePart>
<namePart type="family">Guzman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthias</namePart>
<namePart type="family">Huck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="given">Jimeno</namePart>
<namePart type="family">Yepes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Kocmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Makoto</namePart>
<namePart type="family">Morishita</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Human evaluation has always been expensive while researchers struggle to trust the automatic metrics. To address this, we propose to customise traditional metrics by taking advantages of the pre-trained language models (PLMs) and the limited available human labelled scores. We first re-introduce the hLEPOR metric factors, followed by the Python version we developed (ported) which achieved the automatic tuning of the weighting parameters in hLEPOR metric. Then we present the customised hLEPOR (cushLEPOR) which uses Optuna hyper-parameter optimisation framework to fine-tune hLEPOR weighting parameters towards better agreement to pre-trained language models (using LaBSE) regarding the exact MT language pairs that cushLEPOR is deployed to. We also optimise cushLEPOR towards professional human evaluation data based on MQM and pSQM framework on English-German and Chinese-English language pairs. The experimental investigations show cushLEPOR boosts hLEPOR performances towards better agreements to PLMs like LABSE with much lower cost, and better agreements to human evaluations including MQM and pSQM scores, and yields much better performances than BLEU. Official results show that our submissions win three language pairs including English-German and Chinese-English on News domain via cushLEPOR(LM) and English-Russian on TED domain via hLEPOR. (data available at https://github.com/poethan/cushLEPOR)</abstract>
<identifier type="citekey">han-etal-2021-cushlepor</identifier>
<location>
<url>https://aclanthology.org/2021.wmt-1.109</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>1014</start>
<end>1023</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T cushLEPOR: customising hLEPOR metric using Optuna for higher agreement with human judgments or pre-trained language model LaBSE
%A Han, Lifeng
%A Sorokina, Irina
%A Erofeev, Gleb
%A Gladkoff, Serge
%Y Barrault, Loic
%Y Bojar, Ondrej
%Y Bougares, Fethi
%Y Chatterjee, Rajen
%Y Costa-jussa, Marta R.
%Y Federmann, Christian
%Y Fishel, Mark
%Y Fraser, Alexander
%Y Freitag, Markus
%Y Graham, Yvette
%Y Grundkiewicz, Roman
%Y Guzman, Paco
%Y Haddow, Barry
%Y Huck, Matthias
%Y Yepes, Antonio Jimeno
%Y Koehn, Philipp
%Y Kocmi, Tom
%Y Martins, Andre
%Y Morishita, Makoto
%Y Monz, Christof
%S Proceedings of the Sixth Conference on Machine Translation
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online
%F han-etal-2021-cushlepor
%X Human evaluation has always been expensive while researchers struggle to trust the automatic metrics. To address this, we propose to customise traditional metrics by taking advantages of the pre-trained language models (PLMs) and the limited available human labelled scores. We first re-introduce the hLEPOR metric factors, followed by the Python version we developed (ported) which achieved the automatic tuning of the weighting parameters in hLEPOR metric. Then we present the customised hLEPOR (cushLEPOR) which uses Optuna hyper-parameter optimisation framework to fine-tune hLEPOR weighting parameters towards better agreement to pre-trained language models (using LaBSE) regarding the exact MT language pairs that cushLEPOR is deployed to. We also optimise cushLEPOR towards professional human evaluation data based on MQM and pSQM framework on English-German and Chinese-English language pairs. The experimental investigations show cushLEPOR boosts hLEPOR performances towards better agreements to PLMs like LABSE with much lower cost, and better agreements to human evaluations including MQM and pSQM scores, and yields much better performances than BLEU. Official results show that our submissions win three language pairs including English-German and Chinese-English on News domain via cushLEPOR(LM) and English-Russian on TED domain via hLEPOR. (data available at https://github.com/poethan/cushLEPOR)
%U https://aclanthology.org/2021.wmt-1.109
%P 1014-1023
Markdown (Informal)
[cushLEPOR: customising hLEPOR metric using Optuna for higher agreement with human judgments or pre-trained language model LaBSE](https://aclanthology.org/2021.wmt-1.109) (Han et al., WMT 2021)
ACL