@inproceedings{lippincott-van-durme-2021-active,
title = "Active learning and negative evidence for language identification",
author = "Lippincott, Thomas and
Van Durme, Ben",
editor = "Dragut, Eduard and
Li, Yunyao and
Popa, Lucian and
Vucetic, Slobodan",
booktitle = "Proceedings of the Second Workshop on Data Science with Human in the Loop: Language Advances",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.dash-1.8",
doi = "10.18653/v1/2021.dash-1.8",
pages = "47--51",
abstract = "Language identification (LID), the task of determining the natural language of a given text, is an essential first step in most NLP pipelines. While generally a solved problem for documents of sufficient length and languages with ample training data, the proliferation of microblogs and other social media has made it increasingly common to encounter use-cases that *don{'}t* satisfy these conditions. In these situations, the fundamental difficulty is the lack of, and cost of gathering, labeled data: unlike some annotation tasks, no single {``}expert{''} can quickly and reliably identify more than a handful of languages. This leads to a natural question: can we gain useful information when annotators are only able to *rule out* languages for a given document, rather than supply a positive label? What are the optimal choices for gathering and representing such *negative evidence* as a model is trained? In this paper, we demonstrate that using negative evidence can improve the performance of a simple neural LID model. This improvement is sensitive to policies of how the evidence is represented in the loss function, and for deciding which annotators to employ given the instance and model state. We consider simple policies and report experimental results that indicate the optimal choices for this task. We conclude with a discussion of future work to determine if and how the results generalize to other classification tasks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="lippincott-van-durme-2021-active">
    <titleInfo>
      <title>Active learning and negative evidence for language identification</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Thomas</namePart>
      <namePart type="family">Lippincott</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ben</namePart>
      <namePart type="family">Van Durme</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Second Workshop on Data Science with Human in the Loop: Language Advances</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Eduard</namePart>
        <namePart type="family">Dragut</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yunyao</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lucian</namePart>
        <namePart type="family">Popa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Slobodan</namePart>
        <namePart type="family">Vucetic</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Language identification (LID), the task of determining the natural language of a given text, is an essential first step in most NLP pipelines. While generally a solved problem for documents of sufficient length and languages with ample training data, the proliferation of microblogs and other social media has made it increasingly common to encounter use-cases that *don’t* satisfy these conditions. In these situations, the fundamental difficulty is the lack of, and cost of gathering, labeled data: unlike some annotation tasks, no single “expert” can quickly and reliably identify more than a handful of languages. This leads to a natural question: can we gain useful information when annotators are only able to *rule out* languages for a given document, rather than supply a positive label? What are the optimal choices for gathering and representing such *negative evidence* as a model is trained? In this paper, we demonstrate that using negative evidence can improve the performance of a simple neural LID model. This improvement is sensitive to policies of how the evidence is represented in the loss function, and for deciding which annotators to employ given the instance and model state. We consider simple policies and report experimental results that indicate the optimal choices for this task. We conclude with a discussion of future work to determine if and how the results generalize to other classification tasks.</abstract>
    <identifier type="citekey">lippincott-van-durme-2021-active</identifier>
    <identifier type="doi">10.18653/v1/2021.dash-1.8</identifier>
    <location>
      <url>https://aclanthology.org/2021.dash-1.8</url>
    </location>
    <part>
      <date>2021-06</date>
      <extent unit="page">
        <start>47</start>
        <end>51</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Active learning and negative evidence for language identification
%A Lippincott, Thomas
%A Van Durme, Ben
%Y Dragut, Eduard
%Y Li, Yunyao
%Y Popa, Lucian
%Y Vucetic, Slobodan
%S Proceedings of the Second Workshop on Data Science with Human in the Loop: Language Advances
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F lippincott-van-durme-2021-active
%X Language identification (LID), the task of determining the natural language of a given text, is an essential first step in most NLP pipelines. While generally a solved problem for documents of sufficient length and languages with ample training data, the proliferation of microblogs and other social media has made it increasingly common to encounter use-cases that *don’t* satisfy these conditions. In these situations, the fundamental difficulty is the lack of, and cost of gathering, labeled data: unlike some annotation tasks, no single “expert” can quickly and reliably identify more than a handful of languages. This leads to a natural question: can we gain useful information when annotators are only able to *rule out* languages for a given document, rather than supply a positive label? What are the optimal choices for gathering and representing such *negative evidence* as a model is trained? In this paper, we demonstrate that using negative evidence can improve the performance of a simple neural LID model. This improvement is sensitive to policies of how the evidence is represented in the loss function, and for deciding which annotators to employ given the instance and model state. We consider simple policies and report experimental results that indicate the optimal choices for this task. We conclude with a discussion of future work to determine if and how the results generalize to other classification tasks.
%R 10.18653/v1/2021.dash-1.8
%U https://aclanthology.org/2021.dash-1.8
%U https://doi.org/10.18653/v1/2021.dash-1.8
%P 47-51
Markdown (Informal)
[Active learning and negative evidence for language identification](https://aclanthology.org/2021.dash-1.8) (Lippincott & Van Durme, DaSH 2021)
ACL
Thomas Lippincott and Ben Van Durme. 2021. [Active learning and negative evidence for language identification](https://aclanthology.org/2021.dash-1.8). In *Proceedings of the Second Workshop on Data Science with Human in the Loop: Language Advances*, pages 47–51, Online. Association for Computational Linguistics.
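
The abstract mentions that negative evidence (languages an annotator has *ruled out*) can be represented in the loss function, but the records above do not spell out a formulation. As a minimal, hypothetical sketch of the general idea (not the specific loss used in the paper), a classifier can be trained to maximize the probability mass it assigns to the languages that have not been ruled out; the PyTorch function below is an illustrative assumption, not the authors' implementation.

```python
# Hypothetical sketch: one way to fold negative evidence into a
# classification loss. Illustrates the idea from the abstract only;
# the paper's actual formulation may differ.
import torch
import torch.nn.functional as F


def negative_evidence_loss(logits: torch.Tensor, ruled_out: torch.Tensor) -> torch.Tensor:
    """Loss for instances where annotators could only rule languages out.

    logits:    (batch, num_languages) unnormalized model scores.
    ruled_out: (batch, num_languages) boolean mask, True where an
               annotator has ruled that language out for the instance.

    Maximizes log P(label is in the allowed set), pushing probability
    mass off the ruled-out languages.
    """
    log_probs = F.log_softmax(logits, dim=-1)
    # Exclude ruled-out languages, then logsumexp over the remainder
    # to get the log-probability of the allowed set.
    allowed = log_probs.masked_fill(ruled_out, float("-inf"))
    return -torch.logsumexp(allowed, dim=-1).mean()


# Toy usage: 2 documents, 4 candidate languages.
logits = torch.randn(2, 4, requires_grad=True)
ruled_out = torch.tensor([[True, False, False, True],
                          [False, True, False, False]])
negative_evidence_loss(logits, ruled_out).backward()
```

In this sketch the term can simply be added to an ordinary cross-entropy over positively labeled instances; an instance whose languages are all ruled out would make the term infinite, so such cases would need to be filtered or handled separately.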