@inproceedings{sileo-2024-tasksource,
title = "tasksource: A Large Collection of {NLP} tasks with a Structured Dataset Preprocessing Framework",
author = "Sileo, Damien",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1361/",
pages = "15655--15684",
abstract = "The HuggingFace Datasets Hub hosts thousands of datasets, offering exciting opportunities for language model training and evaluation. However, datasets for a specific task type often have different structures, making harmonization challenging which prevents the interchangeable use of comparable datasets. As a result, multi-task training or evaluation necessitates manual work to fit data into task templates. Several initiatives independently tackle this issue by releasing harmonized datasets or providing harmonization codes to preprocess datasets into a consistent format. We identify patterns in such preprocessings, such as column renaming, or more complex patterns. We then propose an annotation framework that enables concise, readable, and reusable preprocessing annotations. tasksource annotates more than 600 task preprocessings and provides a backend to automate dataset alignment. We fine-tune a multi-task text encoder on all tasksource tasks, outperforming every publicly available text encoder of comparable parameter count according to an external evaluation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sileo-2024-tasksource">
<titleInfo>
<title>tasksource: A Large Collection of NLP tasks with a Structured Dataset Preprocessing Framework</title>
</titleInfo>
<name type="personal">
<namePart type="given">Damien</namePart>
<namePart type="family">Sileo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The HuggingFace Datasets Hub hosts thousands of datasets, offering exciting opportunities for language model training and evaluation. However, datasets for a specific task type often have different structures, making harmonization challenging which prevents the interchangeable use of comparable datasets. As a result, multi-task training or evaluation necessitates manual work to fit data into task templates. Several initiatives independently tackle this issue by releasing harmonized datasets or providing harmonization codes to preprocess datasets into a consistent format. We identify patterns in such preprocessings, such as column renaming, or more complex patterns. We then propose an annotation framework that enables concise, readable, and reusable preprocessing annotations. tasksource annotates more than 600 task preprocessings and provides a backend to automate dataset alignment. We fine-tune a multi-task text encoder on all tasksource tasks, outperforming every publicly available text encoder of comparable parameter count according to an external evaluation.</abstract>
<identifier type="citekey">sileo-2024-tasksource</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1361/</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>15655</start>
<end>15684</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T tasksource: A Large Collection of NLP tasks with a Structured Dataset Preprocessing Framework
%A Sileo, Damien
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F sileo-2024-tasksource
%X The HuggingFace Datasets Hub hosts thousands of datasets, offering exciting opportunities for language model training and evaluation. However, datasets for a specific task type often have different structures, making harmonization challenging which prevents the interchangeable use of comparable datasets. As a result, multi-task training or evaluation necessitates manual work to fit data into task templates. Several initiatives independently tackle this issue by releasing harmonized datasets or providing harmonization codes to preprocess datasets into a consistent format. We identify patterns in such preprocessings, such as column renaming, or more complex patterns. We then propose an annotation framework that enables concise, readable, and reusable preprocessing annotations. tasksource annotates more than 600 task preprocessings and provides a backend to automate dataset alignment. We fine-tune a multi-task text encoder on all tasksource tasks, outperforming every publicly available text encoder of comparable parameter count according to an external evaluation.
%U https://aclanthology.org/2024.lrec-main.1361/
%P 15655-15684
Markdown (Informal)
[tasksource: A Large Collection of NLP tasks with a Structured Dataset Preprocessing Framework](https://aclanthology.org/2024.lrec-main.1361/) (Sileo, LREC-COLING 2024)
ACL