@inproceedings{bingel-haider-2014-named,
title = "Named Entity Tagging a Very Large Unbalanced Corpus: Training and Evaluating {NE} Classifiers",
author = "Bingel, Joachim and
Haider, Thomas",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Loftsson, Hrafn and
Maegaard, Bente and
Mariani, Joseph and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)",
month = may,
year = "2014",
address = "Reykjavik, Iceland",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2014/pdf/967_Paper.pdf",
abstract = "We describe a systematic and application-oriented approach to training and evaluating named entity recognition and classification (NERC) systems, the purpose of which is to identify an optimal system and to train an optimal model for named entity tagging DeReKo, a very large general-purpose corpus of contemporary German (Kupietz et al., 2010). DeReKo {`}s strong dispersion wrt. genre, register and time forces us to base our decision for a specific NERC system on an evaluation performed on a representative sample of DeReKo instead of performance figures that have been reported for the individual NERC systems when evaluated on more uniform and less diverse data. We create and manually annotate such a representative sample as evaluation data for three different NERC systems, for each of which various models are learnt on multiple training data. The proposed sampling method can be viewed as a generally applicable method for sampling evaluation data from an unbalanced target corpus for any sort of natural language processing.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bingel-haider-2014-named">
<titleInfo>
<title>Named Entity Tagging a Very Large Unbalanced Corpus: Training and Evaluating NE Classifiers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Joachim</namePart>
<namePart type="family">Bingel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Haider</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2014-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hrafn</namePart>
<namePart type="family">Loftsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Reykjavik, Iceland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We describe a systematic and application-oriented approach to training and evaluating named entity recognition and classification (NERC) systems, the purpose of which is to identify an optimal system and to train an optimal model for named entity tagging DeReKo, a very large general-purpose corpus of contemporary German (Kupietz et al., 2010). DeReKo ‘s strong dispersion wrt. genre, register and time forces us to base our decision for a specific NERC system on an evaluation performed on a representative sample of DeReKo instead of performance figures that have been reported for the individual NERC systems when evaluated on more uniform and less diverse data. We create and manually annotate such a representative sample as evaluation data for three different NERC systems, for each of which various models are learnt on multiple training data. The proposed sampling method can be viewed as a generally applicable method for sampling evaluation data from an unbalanced target corpus for any sort of natural language processing.</abstract>
<identifier type="citekey">bingel-haider-2014-named</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2014/pdf/967_Paper.pdf</url>
</location>
<part>
<date>2014-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Named Entity Tagging a Very Large Unbalanced Corpus: Training and Evaluating NE Classifiers
%A Bingel, Joachim
%A Haider, Thomas
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Loftsson, Hrafn
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)
%D 2014
%8 May
%I European Language Resources Association (ELRA)
%C Reykjavik, Iceland
%F bingel-haider-2014-named
%X We describe a systematic and application-oriented approach to training and evaluating named entity recognition and classification (NERC) systems, the purpose of which is to identify an optimal system and to train an optimal model for named entity tagging DeReKo, a very large general-purpose corpus of contemporary German (Kupietz et al., 2010). DeReKo ‘s strong dispersion wrt. genre, register and time forces us to base our decision for a specific NERC system on an evaluation performed on a representative sample of DeReKo instead of performance figures that have been reported for the individual NERC systems when evaluated on more uniform and less diverse data. We create and manually annotate such a representative sample as evaluation data for three different NERC systems, for each of which various models are learnt on multiple training data. The proposed sampling method can be viewed as a generally applicable method for sampling evaluation data from an unbalanced target corpus for any sort of natural language processing.
%U http://www.lrec-conf.org/proceedings/lrec2014/pdf/967_Paper.pdf
Markdown (Informal)
[Named Entity Tagging a Very Large Unbalanced Corpus: Training and Evaluating NE Classifiers](http://www.lrec-conf.org/proceedings/lrec2014/pdf/967_Paper.pdf) (Bingel & Haider, LREC 2014)
ACL