% Conference paper record (ACL Anthology id 2022.clib-1.3).
% The proper nouns in the title are brace-protected so that sentence-casing
% bibliography styles do not lowercase them.
@inproceedings{atnashev-etal-2022-razmecheno,
  title     = {{Razmecheno}: Named Entity Recognition from Digital Archive of Diaries {\textquotedblleft}{Prozhito}{\textquotedblright}},
  author    = {Atnashev, Timofey and
               Ganeeva, Veronika and
               Kazakov, Roman and
               Matyash, Daria and
               Sonkin, Michael and
               Voloshina, Ekaterina and
               Serikov, Oleg and
               Artemova, Ekaterina},
  booktitle = {Proceedings of the Fifth International Conference on Computational Linguistics in Bulgaria (CLIB 2022)},
  month     = sep,
  year      = {2022},
  address   = {Sofia, Bulgaria},
  publisher = {Department of Computational Linguistics, IBL -- BAS},
  url       = {https://aclanthology.org/2022.clib-1.3/},
  pages     = {22--38},
  abstract  = {The vast majority of existing datasets for Named Entity Recognition (NER) are built primarily on news, research papers and Wikipedia with a few exceptions, created from historical and literary texts. What is more, English is the main source for data for further labelling. This paper aims to fill in multiple gaps by creating a novel dataset {\textquotedblleft}Razmecheno{\textquotedblright}, gathered from the diary texts of the project {\textquotedblleft}Prozhito{\textquotedblright} in Russian. Our dataset is of interest for multiple research lines: literary studies of diary texts, transfer learning from other domains, low-resource or cross-lingual named entity recognition. Razmecheno comprises 1331 sentences and 14119 tokens, sampled from diaries, written during the Perestroika. The annotation schema consists of five commonly used entity tags: person, characteristics, location, organisation, and facility. The labelling is carried out on the crowdsourcing platform Yandex.Toloka in two stages. First, workers selected sentences, which contain an entity of particular type. Second, they marked up entity spans. As a result 1113 entities were obtained. Empirical evaluation of Razmecheno is carried out with off-the-shelf NER tools and by fine-tuning pre-trained contextualized encoders. We release the annotated dataset for open access.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey atnashev-etal-2022-razmecheno: one <mods> entry
     describing the paper, with the host proceedings given in <relatedItem type="host">. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="atnashev-etal-2022-razmecheno">
    <titleInfo>
      <title>Razmecheno: Named Entity Recognition from Digital Archive of Diaries “Prozhito”</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Timofey</namePart>
      <namePart type="family">Atnashev</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Veronika</namePart>
      <namePart type="family">Ganeeva</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Roman</namePart>
      <namePart type="family">Kazakov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Daria</namePart>
      <namePart type="family">Matyash</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Michael</namePart>
      <namePart type="family">Sonkin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ekaterina</namePart>
      <namePart type="family">Voloshina</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Oleg</namePart>
      <namePart type="family">Serikov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ekaterina</namePart>
      <namePart type="family">Artemova</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host publication: the proceedings volume containing this paper -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Fifth International Conference on Computational Linguistics in Bulgaria (CLIB 2022)</title>
      </titleInfo>
      <originInfo>
        <publisher>Department of Computational Linguistics, IBL – BAS</publisher>
        <place>
          <placeTerm type="text">Sofia, Bulgaria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The vast majority of existing datasets for Named Entity Recognition (NER) are built primarily on news, research papers and Wikipedia with a few exceptions, created from historical and literary texts. What is more, English is the main source for data for further labelling. This paper aims to fill in multiple gaps by creating a novel dataset “Razmecheno”, gathered from the diary texts of the project “Prozhito” in Russian. Our dataset is of interest for multiple research lines: literary studies of diary texts, transfer learning from other domains, low-resource or cross-lingual named entity recognition. Razmecheno comprises 1331 sentences and 14119 tokens, sampled from diaries, written during the Perestroika. The annotation schema consists of five commonly used entity tags: person, characteristics, location, organisation, and facility. The labelling is carried out on the crowdsourcing platform Yandex.Toloka in two stages. First, workers selected sentences, which contain an entity of particular type. Second, they marked up entity spans. As a result 1113 entities were obtained. Empirical evaluation of Razmecheno is carried out with off-the-shelf NER tools and by fine-tuning pre-trained contextualized encoders. We release the annotated dataset for open access.</abstract>
    <identifier type="citekey">atnashev-etal-2022-razmecheno</identifier>
    <location>
      <url>https://aclanthology.org/2022.clib-1.3/</url>
    </location>
    <part>
      <date>2022-09</date>
      <extent unit="page">
        <start>22</start>
        <end>38</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Razmecheno: Named Entity Recognition from Digital Archive of Diaries “Prozhito”
%A Atnashev, Timofey
%A Ganeeva, Veronika
%A Kazakov, Roman
%A Matyash, Daria
%A Sonkin, Michael
%A Voloshina, Ekaterina
%A Serikov, Oleg
%A Artemova, Ekaterina
%S Proceedings of the Fifth International Conference on Computational Linguistics in Bulgaria (CLIB 2022)
%D 2022
%8 September
%I Department of Computational Linguistics, IBL – BAS
%C Sofia, Bulgaria
%F atnashev-etal-2022-razmecheno
%X The vast majority of existing datasets for Named Entity Recognition (NER) are built primarily on news, research papers and Wikipedia with a few exceptions, created from historical and literary texts. What is more, English is the main source for data for further labelling. This paper aims to fill in multiple gaps by creating a novel dataset “Razmecheno”, gathered from the diary texts of the project “Prozhito” in Russian. Our dataset is of interest for multiple research lines: literary studies of diary texts, transfer learning from other domains, low-resource or cross-lingual named entity recognition. Razmecheno comprises 1331 sentences and 14119 tokens, sampled from diaries, written during the Perestroika. The annotation schema consists of five commonly used entity tags: person, characteristics, location, organisation, and facility. The labelling is carried out on the crowdsourcing platform Yandex.Toloka in two stages. First, workers selected sentences, which contain an entity of particular type. Second, they marked up entity spans. As a result 1113 entities were obtained. Empirical evaluation of Razmecheno is carried out with off-the-shelf NER tools and by fine-tuning pre-trained contextualized encoders. We release the annotated dataset for open access.
%U https://aclanthology.org/2022.clib-1.3/
%P 22-38
Markdown (Informal)
[Razmecheno: Named Entity Recognition from Digital Archive of Diaries “Prozhito”](https://aclanthology.org/2022.clib-1.3/) (Atnashev et al., CLIB 2022)
ACL
- Timofey Atnashev, Veronika Ganeeva, Roman Kazakov, Daria Matyash, Michael Sonkin, Ekaterina Voloshina, Oleg Serikov, and Ekaterina Artemova. 2022. Razmecheno: Named Entity Recognition from Digital Archive of Diaries “Prozhito”. In Proceedings of the Fifth International Conference on Computational Linguistics in Bulgaria (CLIB 2022), pages 22–38, Sofia, Bulgaria. Department of Computational Linguistics, IBL – BAS.