@inproceedings{casola-etal-2022-whats,
title = "What{'}s in a (dataset{'}s) name? The case of {B}ig{P}atent",
author = "Casola, Silvia and
Lavelli, Alberto and
Saggion, Horacio",
editor = "Bosselut, Antoine and
Chandu, Khyathi and
Dhole, Kaustubh and
Gangal, Varun and
Gehrmann, Sebastian and
Jernite, Yacine and
Novikova, Jekaterina and
Perez-Beltrachini, Laura",
booktitle = "Proceedings of the 2nd Workshop on Natural Language Generation, Evaluation, and Metrics (GEM)",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.gem-1.34",
doi = "10.18653/v1/2022.gem-1.34",
pages = "399--404",
abstract = "Sharing datasets and benchmarks has been crucial for rapidly improving Natural Language Processing models and systems. Documenting datasets{'} characteristics (and any modification introduced over time) is equally important to avoid confusion and make comparisons reliable. Here, we describe the case of BigPatent, a dataset for patent summarization that exists in at least two rather different versions under the same name. While previous literature has not clearly distinguished among versions, their differences do not only lay on a surface level but also modify the dataset{'}s core nature and, thus, the complexity of the summarization task. While this paper describes a specific case, we aim to shed light on new challenges that might emerge in resource sharing and advocate for comprehensive documentation of datasets and models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="casola-etal-2022-whats">
<titleInfo>
<title>What’s in a (dataset’s) name? The case of BigPatent</title>
</titleInfo>
<name type="personal">
<namePart type="given">Silvia</namePart>
<namePart type="family">Casola</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alberto</namePart>
<namePart type="family">Lavelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Horacio</namePart>
<namePart type="family">Saggion</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on Natural Language Generation, Evaluation, and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Antoine</namePart>
<namePart type="family">Bosselut</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khyathi</namePart>
<namePart type="family">Chandu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaustubh</namePart>
<namePart type="family">Dhole</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Varun</namePart>
<namePart type="family">Gangal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yacine</namePart>
<namePart type="family">Jernite</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jekaterina</namePart>
<namePart type="family">Novikova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laura</namePart>
<namePart type="family">Perez-Beltrachini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates (Hybrid)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Sharing datasets and benchmarks has been crucial for rapidly improving Natural Language Processing models and systems. Documenting datasets’ characteristics (and any modification introduced over time) is equally important to avoid confusion and make comparisons reliable. Here, we describe the case of BigPatent, a dataset for patent summarization that exists in at least two rather different versions under the same name. While previous literature has not clearly distinguished among versions, their differences do not only lay on a surface level but also modify the dataset’s core nature and, thus, the complexity of the summarization task. While this paper describes a specific case, we aim to shed light on new challenges that might emerge in resource sharing and advocate for comprehensive documentation of datasets and models.</abstract>
<identifier type="citekey">casola-etal-2022-whats</identifier>
<identifier type="doi">10.18653/v1/2022.gem-1.34</identifier>
<location>
<url>https://aclanthology.org/2022.gem-1.34</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>399</start>
<end>404</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T What’s in a (dataset’s) name? The case of BigPatent
%A Casola, Silvia
%A Lavelli, Alberto
%A Saggion, Horacio
%Y Bosselut, Antoine
%Y Chandu, Khyathi
%Y Dhole, Kaustubh
%Y Gangal, Varun
%Y Gehrmann, Sebastian
%Y Jernite, Yacine
%Y Novikova, Jekaterina
%Y Perez-Beltrachini, Laura
%S Proceedings of the 2nd Workshop on Natural Language Generation, Evaluation, and Metrics (GEM)
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates (Hybrid)
%F casola-etal-2022-whats
%X Sharing datasets and benchmarks has been crucial for rapidly improving Natural Language Processing models and systems. Documenting datasets’ characteristics (and any modification introduced over time) is equally important to avoid confusion and make comparisons reliable. Here, we describe the case of BigPatent, a dataset for patent summarization that exists in at least two rather different versions under the same name. While previous literature has not clearly distinguished among versions, their differences do not only lay on a surface level but also modify the dataset’s core nature and, thus, the complexity of the summarization task. While this paper describes a specific case, we aim to shed light on new challenges that might emerge in resource sharing and advocate for comprehensive documentation of datasets and models.
%R 10.18653/v1/2022.gem-1.34
%U https://aclanthology.org/2022.gem-1.34
%U https://doi.org/10.18653/v1/2022.gem-1.34
%P 399-404
Markdown (Informal)
[What’s in a (dataset’s) name? The case of BigPatent](https://aclanthology.org/2022.gem-1.34) (Casola et al., GEM 2022)
ACL
- Silvia Casola, Alberto Lavelli, and Horacio Saggion. 2022. What’s in a (dataset’s) name? The case of BigPatent. In Proceedings of the 2nd Workshop on Natural Language Generation, Evaluation, and Metrics (GEM), pages 399–404, Abu Dhabi, United Arab Emirates (Hybrid). Association for Computational Linguistics.