% Workshop paper (NLP4DH 2021 proceedings); the url field points to the ACL Anthology page.
% Editor names use BibTeX special-character escapes ({\"a}) so the entry stays ASCII-safe.
@inproceedings{blouin-etal-2021-transferring,
  title     = {Transferring Modern Named Entity Recognition to the Historical Domain: How to Take the Step?},
  author    = {Blouin, Baptiste and
               Favre, Benoit and
               Auguste, Jeremy and
               Henriot, Christian},
  editor    = {H{\"a}m{\"a}l{\"a}inen, Mika and
               Alnajjar, Khalid and
               Partanen, Niko and
               Rueter, Jack},
  booktitle = {Proceedings of the Workshop on Natural Language Processing for Digital Humanities},
  month     = dec,
  year      = {2021},
  address   = {NIT Silchar, India},
  publisher = {NLP Association of India (NLPAI)},
  url       = {https://aclanthology.org/2021.nlp4dh-1.18/},
  pages     = {152--162},
  abstract  = {Named entity recognition is of high interest to digital humanities, in particular when mining historical documents. Although the task is mature in the field of NLP, results of contemporary models are not satisfactory on challenging documents corresponding to out-of-domain genres, noisy OCR output, or old-variants of the target language. In this paper we study how model transfer methods, in the context of the aforementioned challenges, can improve historical named entity recognition according to how much effort is allocated to describing the target data, manually annotating small amounts of texts, or matching pre-training resources. In particular, we explore the situation where the class labels, as well as the quality of the documents to be processed, are different in the source and target domains. We perform extensive experiments with the transformer architecture on the LitBank and HIPE historical datasets, with different annotation schemes and character-level noise. They show that annotating 250 sentences can recover 93{\%} of the full-data performance when models are pre-trained, that the choice of self-supervised and target-task pre-training data is crucial in the zero-shot setting, and that OCR errors can be handled by simulating noise on pre-training data and resorting to recent character-aware transformers.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for a single workshop paper; the proceedings volume is described as the host relatedItem. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="blouin-etal-2021-transferring">
    <!-- Paper title -->
    <titleInfo>
      <title>Transferring Modern Named Entity Recognition to the Historical Domain: How to Take the Step?</title>
    </titleInfo>
    <!-- Paper authors (one name element per person, role = author) -->
    <name type="personal">
      <namePart type="given">Baptiste</namePart>
      <namePart type="family">Blouin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Benoit</namePart>
      <namePart type="family">Favre</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jeremy</namePart>
      <namePart type="family">Auguste</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Christian</namePart>
      <namePart type="family">Henriot</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <!-- Issue date of the paper (year and month) -->
    <originInfo>
      <dateIssued>2021-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Workshop on Natural Language Processing for Digital Humanities</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mika</namePart>
        <namePart type="family">Hämäläinen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Khalid</namePart>
        <namePart type="family">Alnajjar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Niko</namePart>
        <namePart type="family">Partanen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jack</namePart>
        <namePart type="family">Rueter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>NLP Association of India (NLPAI)</publisher>
        <place>
          <placeTerm type="text">NIT Silchar, India</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Named entity recognition is of high interest to digital humanities, in particular when mining historical documents. Although the task is mature in the field of NLP, results of contemporary models are not satisfactory on challenging documents corresponding to out-of-domain genres, noisy OCR output, or old-variants of the target language. In this paper we study how model transfer methods, in the context of the aforementioned challenges, can improve historical named entity recognition according to how much effort is allocated to describing the target data, manually annotating small amounts of texts, or matching pre-training resources. In particular, we explore the situation where the class labels, as well as the quality of the documents to be processed, are different in the source and target domains. We perform extensive experiments with the transformer architecture on the LitBank and HIPE historical datasets, with different annotation schemes and character-level noise. They show that annotating 250 sentences can recover 93% of the full-data performance when models are pre-trained, that the choice of self-supervised and target-task pre-training data is crucial in the zero-shot setting, and that OCR errors can be handled by simulating noise on pre-training data and resorting to recent character-aware transformers.</abstract>
    <!-- Citation key; same value as the mods ID attribute -->
    <identifier type="citekey">blouin-etal-2021-transferring</identifier>
    <location>
      <url>https://aclanthology.org/2021.nlp4dh-1.18/</url>
    </location>
    <!-- Date and page extent of this paper -->
    <part>
      <date>2021-12</date>
      <extent unit="page">
        <start>152</start>
        <end>162</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Transferring Modern Named Entity Recognition to the Historical Domain: How to Take the Step?
%A Blouin, Baptiste
%A Favre, Benoit
%A Auguste, Jeremy
%A Henriot, Christian
%Y Hämäläinen, Mika
%Y Alnajjar, Khalid
%Y Partanen, Niko
%Y Rueter, Jack
%S Proceedings of the Workshop on Natural Language Processing for Digital Humanities
%D 2021
%8 December
%I NLP Association of India (NLPAI)
%C NIT Silchar, India
%F blouin-etal-2021-transferring
%X Named entity recognition is of high interest to digital humanities, in particular when mining historical documents. Although the task is mature in the field of NLP, results of contemporary models are not satisfactory on challenging documents corresponding to out-of-domain genres, noisy OCR output, or old-variants of the target language. In this paper we study how model transfer methods, in the context of the aforementioned challenges, can improve historical named entity recognition according to how much effort is allocated to describing the target data, manually annotating small amounts of texts, or matching pre-training resources. In particular, we explore the situation where the class labels, as well as the quality of the documents to be processed, are different in the source and target domains. We perform extensive experiments with the transformer architecture on the LitBank and HIPE historical datasets, with different annotation schemes and character-level noise. They show that annotating 250 sentences can recover 93% of the full-data performance when models are pre-trained, that the choice of self-supervised and target-task pre-training data is crucial in the zero-shot setting, and that OCR errors can be handled by simulating noise on pre-training data and resorting to recent character-aware transformers.
%U https://aclanthology.org/2021.nlp4dh-1.18/
%P 152-162
Markdown (Informal)
[Transferring Modern Named Entity Recognition to the Historical Domain: How to Take the Step?](https://aclanthology.org/2021.nlp4dh-1.18/) (Blouin et al., NLP4DH 2021)
ACL