@inproceedings{krause-etal-2021-bootstrapping,
title = "Bootstrapping Multilingual Metadata Extraction: A Showcase in {C}yrillic",
author = {Krause, Johan and
Shapiro, Igor and
Saier, Tarek and
F{\"a}rber, Michael},
editor = "Beltagy, Iz and
Cohan, Arman and
Feigenblat, Guy and
Freitag, Dayne and
Ghosal, Tirthankar and
Hall, Keith and
Herrmannova, Drahomira and
Knoth, Petr and
Lo, Kyle and
Mayr, Philipp and
Patton, Robert M. and
Shmueli-Scheuer, Michal and
de Waard, Anita and
Wang, Kuansan and
Wang, Lucy Lu",
booktitle = "Proceedings of the Second Workshop on Scholarly Document Processing",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.sdp-1.8/",
doi = "10.18653/v1/2021.sdp-1.8",
pages = "66--72",
abstract = "Applications based on scholarly data are of ever increasing importance. This results in disadvantages for areas where high-quality data and compatible systems are not available, such as non-English publications. To advance the mitigation of this imbalance, we use Cyrillic script publications from the CORE collection to create a high-quality data set for metadata extraction. We utilize our data for training and evaluating sequence labeling models to extract title and author information. Retraining GROBID on our data, we observe significant improvements in terms of precision and recall and achieve even better results with a self developed model. We make our data set covering over 15,000 publications as well as our source code freely available."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="krause-etal-2021-bootstrapping">
<titleInfo>
<title>Bootstrapping Multilingual Metadata Extraction: A Showcase in Cyrillic</title>
</titleInfo>
<name type="personal">
<namePart type="given">Johan</namePart>
<namePart type="family">Krause</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Igor</namePart>
<namePart type="family">Shapiro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tarek</namePart>
<namePart type="family">Saier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Färber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Scholarly Document Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Iz</namePart>
<namePart type="family">Beltagy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arman</namePart>
<namePart type="family">Cohan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guy</namePart>
<namePart type="family">Feigenblat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dayne</namePart>
<namePart type="family">Freitag</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tirthankar</namePart>
<namePart type="family">Ghosal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Keith</namePart>
<namePart type="family">Hall</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Drahomira</namePart>
<namePart type="family">Herrmannova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Petr</namePart>
<namePart type="family">Knoth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Mayr</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Patton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michal</namePart>
<namePart type="family">Shmueli-Scheuer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anita</namePart>
<namePart type="family">de Waard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kuansan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucy</namePart>
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Applications based on scholarly data are of ever increasing importance. This results in disadvantages for areas where high-quality data and compatible systems are not available, such as non-English publications. To advance the mitigation of this imbalance, we use Cyrillic script publications from the CORE collection to create a high-quality data set for metadata extraction. We utilize our data for training and evaluating sequence labeling models to extract title and author information. Retraining GROBID on our data, we observe significant improvements in terms of precision and recall and achieve even better results with a self developed model. We make our data set covering over 15,000 publications as well as our source code freely available.</abstract>
<identifier type="citekey">krause-etal-2021-bootstrapping</identifier>
<identifier type="doi">10.18653/v1/2021.sdp-1.8</identifier>
<location>
<url>https://aclanthology.org/2021.sdp-1.8/</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>66</start>
<end>72</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bootstrapping Multilingual Metadata Extraction: A Showcase in Cyrillic
%A Krause, Johan
%A Shapiro, Igor
%A Saier, Tarek
%A Färber, Michael
%Y Beltagy, Iz
%Y Cohan, Arman
%Y Feigenblat, Guy
%Y Freitag, Dayne
%Y Ghosal, Tirthankar
%Y Hall, Keith
%Y Herrmannova, Drahomira
%Y Knoth, Petr
%Y Lo, Kyle
%Y Mayr, Philipp
%Y Patton, Robert M.
%Y Shmueli-Scheuer, Michal
%Y de Waard, Anita
%Y Wang, Kuansan
%Y Wang, Lucy Lu
%S Proceedings of the Second Workshop on Scholarly Document Processing
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F krause-etal-2021-bootstrapping
%X Applications based on scholarly data are of ever increasing importance. This results in disadvantages for areas where high-quality data and compatible systems are not available, such as non-English publications. To advance the mitigation of this imbalance, we use Cyrillic script publications from the CORE collection to create a high-quality data set for metadata extraction. We utilize our data for training and evaluating sequence labeling models to extract title and author information. Retraining GROBID on our data, we observe significant improvements in terms of precision and recall and achieve even better results with a self developed model. We make our data set covering over 15,000 publications as well as our source code freely available.
%R 10.18653/v1/2021.sdp-1.8
%U https://aclanthology.org/2021.sdp-1.8/
%U https://doi.org/10.18653/v1/2021.sdp-1.8
%P 66-72
Markdown (Informal)
[Bootstrapping Multilingual Metadata Extraction: A Showcase in Cyrillic](https://aclanthology.org/2021.sdp-1.8/) (Krause et al., sdp 2021)
ACL