@inproceedings{rubinstein-shmidman-2021-nlp,
title = "{NLP} in the {DH} pipeline: Transfer-learning to a Chronolect",
author = "Rubinstein, Aynat and
Shmidman, Avi",
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and
Alnajjar, Khalid and
Partanen, Niko and
Rueter, Jack},
booktitle = "Proceedings of the Workshop on Natural Language Processing for Digital Humanities",
month = dec,
year = "2021",
address = "NIT Silchar, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2021.nlp4dh-1.12/",
pages = "106--110",
abstract = "A big unknown in Digital Humanities (DH) projects that seek to analyze previously untouched corpora is the question of how to adapt existing Natural Language Processing (NLP) resources to the specific nature of the target corpus. In this paper, we study the case of Emergent Modern Hebrew (EMH), an under-resourced chronolect of the Hebrew language. The resource we seek to adapt, a diacritizer, exists for both earlier and later chronolects of the language. Given a small annotated corpus of our target chronolect, we demonstrate that applying transfer-learning from either of the chronolects is preferable to training a new model from scratch. Furthermore, we consider just how much annotated data is necessary. For our task, we find that even a minimal corpus of 50K tokens provides a noticeable gain in accuracy. At the same time, we also evaluate accuracy at three additional increments, in order to quantify the gains that can be expected by investing in a larger annotated corpus."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rubinstein-shmidman-2021-nlp">
<titleInfo>
<title>NLP in the DH pipeline: Transfer-learning to a Chronolect</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aynat</namePart>
<namePart type="family">Rubinstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Avi</namePart>
<namePart type="family">Shmidman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Natural Language Processing for Digital Humanities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mika</namePart>
<namePart type="family">Hämäläinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Alnajjar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Niko</namePart>
<namePart type="family">Partanen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jack</namePart>
<namePart type="family">Rueter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">NIT Silchar, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>A big unknown in Digital Humanities (DH) projects that seek to analyze previously untouched corpora is the question of how to adapt existing Natural Language Processing (NLP) resources to the specific nature of the target corpus. In this paper, we study the case of Emergent Modern Hebrew (EMH), an under-resourced chronolect of the Hebrew language. The resource we seek to adapt, a diacritizer, exists for both earlier and later chronolects of the language. Given a small annotated corpus of our target chronolect, we demonstrate that applying transfer-learning from either of the chronolects is preferable to training a new model from scratch. Furthermore, we consider just how much annotated data is necessary. For our task, we find that even a minimal corpus of 50K tokens provides a noticeable gain in accuracy. At the same time, we also evaluate accuracy at three additional increments, in order to quantify the gains that can be expected by investing in a larger annotated corpus.</abstract>
<identifier type="citekey">rubinstein-shmidman-2021-nlp</identifier>
<location>
<url>https://aclanthology.org/2021.nlp4dh-1.12/</url>
</location>
<part>
<date>2021-12</date>
<extent unit="page">
<start>106</start>
<end>110</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T NLP in the DH pipeline: Transfer-learning to a Chronolect
%A Rubinstein, Aynat
%A Shmidman, Avi
%Y Hämäläinen, Mika
%Y Alnajjar, Khalid
%Y Partanen, Niko
%Y Rueter, Jack
%S Proceedings of the Workshop on Natural Language Processing for Digital Humanities
%D 2021
%8 December
%I NLP Association of India (NLPAI)
%C NIT Silchar, India
%F rubinstein-shmidman-2021-nlp
%X A big unknown in Digital Humanities (DH) projects that seek to analyze previously untouched corpora is the question of how to adapt existing Natural Language Processing (NLP) resources to the specific nature of the target corpus. In this paper, we study the case of Emergent Modern Hebrew (EMH), an under-resourced chronolect of the Hebrew language. The resource we seek to adapt, a diacritizer, exists for both earlier and later chronolects of the language. Given a small annotated corpus of our target chronolect, we demonstrate that applying transfer-learning from either of the chronolects is preferable to training a new model from scratch. Furthermore, we consider just how much annotated data is necessary. For our task, we find that even a minimal corpus of 50K tokens provides a noticeable gain in accuracy. At the same time, we also evaluate accuracy at three additional increments, in order to quantify the gains that can be expected by investing in a larger annotated corpus.
%U https://aclanthology.org/2021.nlp4dh-1.12/
%P 106-110
Markdown (Informal)
[NLP in the DH pipeline: Transfer-learning to a Chronolect](https://aclanthology.org/2021.nlp4dh-1.12/) (Rubinstein & Shmidman, NLP4DH 2021)
ACL