BibTeX
@inproceedings{du-flanigan-2021-avoiding,
    title = "Avoiding Overlap in Data Augmentation for {AMR}-to-Text Generation",
    author = "Du, Wenchao and
      Flanigan, Jeffrey",
    editor = "Zong, Chengqing and
      Xia, Fei and
      Li, Wenjie and
      Navigli, Roberto",
    booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)",
    month = aug,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.acl-short.132/",
    doi = "10.18653/v1/2021.acl-short.132",
    pages = "1043--1048",
    abstract = "Leveraging additional unlabeled data to boost model performance is common practice in machine learning and natural language processing. For generation tasks, if there is overlap between the additional data and the target text evaluation data, then training on the additional data is training on answers of the test set. This leads to overly-inflated scores with the additional data compared to real-world testing scenarios and problems when comparing models. We study the AMR dataset and Gigaword, which is popularly used for improving AMR-to-text generators, and find significant overlap between Gigaword and a subset of the AMR dataset. We propose methods for excluding parts of Gigaword to remove this overlap, and show that our approach leads to a more realistic evaluation of the task of AMR-to-text generation. Going forward, we give simple best-practice recommendations for leveraging additional data in AMR-to-text generation."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="du-flanigan-2021-avoiding">
    <titleInfo>
      <title>Avoiding Overlap in Data Augmentation for AMR-to-Text Generation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Wenchao</namePart>
      <namePart type="family">Du</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jeffrey</namePart>
      <namePart type="family">Flanigan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Chengqing</namePart>
        <namePart type="family">Zong</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Fei</namePart>
        <namePart type="family">Xia</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Wenjie</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Roberto</namePart>
        <namePart type="family">Navigli</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Leveraging additional unlabeled data to boost model performance is common practice in machine learning and natural language processing. For generation tasks, if there is overlap between the additional data and the target text evaluation data, then training on the additional data is training on answers of the test set. This leads to overly-inflated scores with the additional data compared to real-world testing scenarios and problems when comparing models. We study the AMR dataset and Gigaword, which is popularly used for improving AMR-to-text generators, and find significant overlap between Gigaword and a subset of the AMR dataset. We propose methods for excluding parts of Gigaword to remove this overlap, and show that our approach leads to a more realistic evaluation of the task of AMR-to-text generation. Going forward, we give simple best-practice recommendations for leveraging additional data in AMR-to-text generation.</abstract>
    <identifier type="citekey">du-flanigan-2021-avoiding</identifier>
    <identifier type="doi">10.18653/v1/2021.acl-short.132</identifier>
    <location>
      <url>https://aclanthology.org/2021.acl-short.132/</url>
    </location>
    <part>
      <date>2021-08</date>
      <extent unit="page">
        <start>1043</start>
        <end>1048</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Avoiding Overlap in Data Augmentation for AMR-to-Text Generation
%A Du, Wenchao
%A Flanigan, Jeffrey
%Y Zong, Chengqing
%Y Xia, Fei
%Y Li, Wenjie
%Y Navigli, Roberto
%S Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F du-flanigan-2021-avoiding
%X Leveraging additional unlabeled data to boost model performance is common practice in machine learning and natural language processing. For generation tasks, if there is overlap between the additional data and the target text evaluation data, then training on the additional data is training on answers of the test set. This leads to overly-inflated scores with the additional data compared to real-world testing scenarios and problems when comparing models. We study the AMR dataset and Gigaword, which is popularly used for improving AMR-to-text generators, and find significant overlap between Gigaword and a subset of the AMR dataset. We propose methods for excluding parts of Gigaword to remove this overlap, and show that our approach leads to a more realistic evaluation of the task of AMR-to-text generation. Going forward, we give simple best-practice recommendations for leveraging additional data in AMR-to-text generation.
%R 10.18653/v1/2021.acl-short.132
%U https://aclanthology.org/2021.acl-short.132/
%U https://doi.org/10.18653/v1/2021.acl-short.132
%P 1043-1048
Markdown (Informal)
[Avoiding Overlap in Data Augmentation for AMR-to-Text Generation](https://aclanthology.org/2021.acl-short.132/) (Du & Flanigan, ACL-IJCNLP 2021)
ACL
Wenchao Du and Jeffrey Flanigan. 2021. Avoiding Overlap in Data Augmentation for AMR-to-Text Generation. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers), pages 1043–1048, Online. Association for Computational Linguistics.
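Illustrative code (not from the paper)
The abstract's recommendation is to exclude, before augmentation, any additional training sentence (e.g., from Gigaword) that overlaps the evaluation references of the AMR dataset. A minimal sketch of that kind of filter is below; the function names, lowercased whitespace tokenization, and 8-gram containment test are all assumptions of this sketch, not details taken from the paper.

```python
# Sketch of overlap-based filtering between an additional corpus and
# evaluation references. NOT the paper's exact method: the n-gram size
# and the containment test are assumptions chosen for clarity.

def ngrams(tokens, n):
    """Set of word n-grams in a token list (empty if fewer than n tokens)."""
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

def build_eval_index(eval_sentences, n=8):
    """Collect every n-gram that appears anywhere in the evaluation data."""
    index = set()
    for sent in eval_sentences:
        index |= ngrams(sent.lower().split(), n)
    return index

def filter_overlap(extra_sentences, eval_index, n=8):
    """Keep only additional sentences that share no n-gram with the eval data."""
    return [
        sent for sent in extra_sentences
        if ngrams(sent.lower().split(), n).isdisjoint(eval_index)
    ]

if __name__ == "__main__":
    eval_refs = ["the cat sat quietly on the warm red mat today"]
    extra = [
        "the cat sat quietly on the warm red mat today , reporters said",  # overlaps: dropped
        "markets rallied after the policy announcement",                    # no overlap: kept
    ]
    index = build_eval_index(eval_refs)
    print(filter_overlap(extra, index))  # only the non-overlapping sentence survives
```

In practice one would normalize punctuation and casing more carefully and tune n by inspecting how much of the additional corpus gets excluded; the paper evaluates its own exclusion strategies, which this sketch does not reproduce.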