@inproceedings{muthuraman-etal-2021-data,
title = "Data Cleaning Tools for Token Classification Tasks",
author = "Muthuraman, Karthik and
Reiss, Frederick and
Xu, Hong and
Cutler, Bryan and
Eichenberger, Zachary",
editor = "Dragut, Eduard and
Li, Yunyao and
Popa, Lucian and
Vucetic, Slobodan",
booktitle = "Proceedings of the Second Workshop on Data Science with Human in the Loop: Language Advances",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.dash-1.10",
doi = "10.18653/v1/2021.dash-1.10",
pages = "59--61",
abstract = "Human-in-the-loop systems for cleaning NLP training data rely on automated sieves to isolate potentially-incorrect labels for manual review. We have developed a novel technique for flagging potentially-incorrect labels with high sensitivity in named entity recognition corpora. We incorporated our sieve into an end-to-end system for cleaning NLP corpora, implemented as a modular collection of Jupyter notebooks built on extensions to the Pandas DataFrame library. We used this system to identify incorrect labels in the CoNLL-2003 corpus for English-language named entity recognition (NER), one of the most influential corpora for NER model research. Unlike previous work that only looked at a subset of the corpus{'}s validation fold, our automated sieve enabled us to examine the entire corpus in depth. Across the entire CoNLL-2003 corpus, we identified over 1300 incorrect labels (out of 35089 in the corpus). We have published our corrections, along with the code we used in our experiments. We are developing a repeatable version of the process we used on the CoNLL-2003 corpus as an open-source library.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="muthuraman-etal-2021-data">
<titleInfo>
<title>Data Cleaning Tools for Token Classification Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Karthik</namePart>
<namePart type="family">Muthuraman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frederick</namePart>
<namePart type="family">Reiss</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hong</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bryan</namePart>
<namePart type="family">Cutler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zachary</namePart>
<namePart type="family">Eichenberger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Data Science with Human in the Loop: Language Advances</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eduard</namePart>
<namePart type="family">Dragut</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunyao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucian</namePart>
<namePart type="family">Popa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Slobodan</namePart>
<namePart type="family">Vucetic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Human-in-the-loop systems for cleaning NLP training data rely on automated sieves to isolate potentially-incorrect labels for manual review. We have developed a novel technique for flagging potentially-incorrect labels with high sensitivity in named entity recognition corpora. We incorporated our sieve into an end-to-end system for cleaning NLP corpora, implemented as a modular collection of Jupyter notebooks built on extensions to the Pandas DataFrame library. We used this system to identify incorrect labels in the CoNLL-2003 corpus for English-language named entity recognition (NER), one of the most influential corpora for NER model research. Unlike previous work that only looked at a subset of the corpus’s validation fold, our automated sieve enabled us to examine the entire corpus in depth. Across the entire CoNLL-2003 corpus, we identified over 1300 incorrect labels (out of 35089 in the corpus). We have published our corrections, along with the code we used in our experiments. We are developing a repeatable version of the process we used on the CoNLL-2003 corpus as an open-source library.</abstract>
<identifier type="citekey">muthuraman-etal-2021-data</identifier>
<identifier type="doi">10.18653/v1/2021.dash-1.10</identifier>
<location>
<url>https://aclanthology.org/2021.dash-1.10</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>59</start>
<end>61</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Data Cleaning Tools for Token Classification Tasks
%A Muthuraman, Karthik
%A Reiss, Frederick
%A Xu, Hong
%A Cutler, Bryan
%A Eichenberger, Zachary
%Y Dragut, Eduard
%Y Li, Yunyao
%Y Popa, Lucian
%Y Vucetic, Slobodan
%S Proceedings of the Second Workshop on Data Science with Human in the Loop: Language Advances
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F muthuraman-etal-2021-data
%X Human-in-the-loop systems for cleaning NLP training data rely on automated sieves to isolate potentially-incorrect labels for manual review. We have developed a novel technique for flagging potentially-incorrect labels with high sensitivity in named entity recognition corpora. We incorporated our sieve into an end-to-end system for cleaning NLP corpora, implemented as a modular collection of Jupyter notebooks built on extensions to the Pandas DataFrame library. We used this system to identify incorrect labels in the CoNLL-2003 corpus for English-language named entity recognition (NER), one of the most influential corpora for NER model research. Unlike previous work that only looked at a subset of the corpus’s validation fold, our automated sieve enabled us to examine the entire corpus in depth. Across the entire CoNLL-2003 corpus, we identified over 1300 incorrect labels (out of 35089 in the corpus). We have published our corrections, along with the code we used in our experiments. We are developing a repeatable version of the process we used on the CoNLL-2003 corpus as an open-source library.
%R 10.18653/v1/2021.dash-1.10
%U https://aclanthology.org/2021.dash-1.10
%U https://doi.org/10.18653/v1/2021.dash-1.10
%P 59-61
Markdown (Informal)
[Data Cleaning Tools for Token Classification Tasks](https://aclanthology.org/2021.dash-1.10) (Muthuraman et al., DaSH 2021)
ACL
- Karthik Muthuraman, Frederick Reiss, Hong Xu, Bryan Cutler, and Zachary Eichenberger. 2021. Data Cleaning Tools for Token Classification Tasks. In Proceedings of the Second Workshop on Data Science with Human in the Loop: Language Advances, pages 59–61, Online. Association for Computational Linguistics.