@inproceedings{guo-etal-2022-prediction,
title = "Prediction Difference Regularization against Perturbation for Neural Machine Translation",
author = "Guo, Dengji and
Ma, Zhengrui and
Zhang, Min and
Feng, Yang",
editor = "Muresan, Smaranda and
Nakov, Preslav and
Villavicencio, Aline",
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.acl-long.528/",
doi = "10.18653/v1/2022.acl-long.528",
pages = "7665--7675",
abstract = "Regularization methods applying input perturbation have drawn considerable attention and have been frequently explored for NMT tasks in recent years. Despite their simplicity and effectiveness, we argue that these methods are limited by the under-fitting of training data. In this paper, we utilize prediction difference for ground-truth tokens to analyze the fitting of token-level samples and find that under-fitting is almost as common as over-fitting. We introduce prediction difference regularization (PD-R), a simple and effective method that can reduce over-fitting and under-fitting at the same time. For all token-level samples, PD-R minimizes the prediction difference between the original pass and the input-perturbed pass, making the model less sensitive to small input changes, thus more robust to both perturbations and under-fitted training data. Experiments on three widely used WMT translation tasks show that our approach can significantly improve over existing perturbation regularization methods. On WMT16 En-De task, our model achieves 1.80 SacreBLEU improvement over vanilla transformer."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="guo-etal-2022-prediction">
<titleInfo>
<title>Prediction Difference Regularization against Perturbation for Neural Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dengji</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhengrui</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Feng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aline</namePart>
<namePart type="family">Villavicencio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Regularization methods applying input perturbation have drawn considerable attention and have been frequently explored for NMT tasks in recent years. Despite their simplicity and effectiveness, we argue that these methods are limited by the under-fitting of training data. In this paper, we utilize prediction difference for ground-truth tokens to analyze the fitting of token-level samples and find that under-fitting is almost as common as over-fitting. We introduce prediction difference regularization (PD-R), a simple and effective method that can reduce over-fitting and under-fitting at the same time. For all token-level samples, PD-R minimizes the prediction difference between the original pass and the input-perturbed pass, making the model less sensitive to small input changes, thus more robust to both perturbations and under-fitted training data. Experiments on three widely used WMT translation tasks show that our approach can significantly improve over existing perturbation regularization methods. On WMT16 En-De task, our model achieves 1.80 SacreBLEU improvement over vanilla transformer.</abstract>
<identifier type="citekey">guo-etal-2022-prediction</identifier>
<identifier type="doi">10.18653/v1/2022.acl-long.528</identifier>
<location>
<url>https://aclanthology.org/2022.acl-long.528/</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>7665</start>
<end>7675</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Prediction Difference Regularization against Perturbation for Neural Machine Translation
%A Guo, Dengji
%A Ma, Zhengrui
%A Zhang, Min
%A Feng, Yang
%Y Muresan, Smaranda
%Y Nakov, Preslav
%Y Villavicencio, Aline
%S Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F guo-etal-2022-prediction
%X Regularization methods applying input perturbation have drawn considerable attention and have been frequently explored for NMT tasks in recent years. Despite their simplicity and effectiveness, we argue that these methods are limited by the under-fitting of training data. In this paper, we utilize prediction difference for ground-truth tokens to analyze the fitting of token-level samples and find that under-fitting is almost as common as over-fitting. We introduce prediction difference regularization (PD-R), a simple and effective method that can reduce over-fitting and under-fitting at the same time. For all token-level samples, PD-R minimizes the prediction difference between the original pass and the input-perturbed pass, making the model less sensitive to small input changes, thus more robust to both perturbations and under-fitted training data. Experiments on three widely used WMT translation tasks show that our approach can significantly improve over existing perturbation regularization methods. On WMT16 En-De task, our model achieves 1.80 SacreBLEU improvement over vanilla transformer.
%R 10.18653/v1/2022.acl-long.528
%U https://aclanthology.org/2022.acl-long.528/
%U https://doi.org/10.18653/v1/2022.acl-long.528
%P 7665-7675
Markdown (Informal)
[Prediction Difference Regularization against Perturbation for Neural Machine Translation](https://aclanthology.org/2022.acl-long.528/) (Guo et al., ACL 2022)
ACL