@inproceedings{prelevikj-zitnik-2021-multilingual,
title = "Multilingual Named Entity Recognition and Matching Using {BERT} and Dedupe for {S}lavic Languages",
author = "Prelevikj, Marko and
Zitnik, Slavko",
editor = "Babych, Bogdan and
Kanishcheva, Olga and
Nakov, Preslav and
Piskorski, Jakub and
Pivovarova, Lidia and
Starko, Vasyl and
Steinberger, Josef and
Yangarber, Roman and
Marci{\'n}czuk, Micha{\l} and
Pollak, Senja and
P{\v{r}}ib{\'a}{\v{n}}, Pavel and
Robnik-{\v{S}}ikonja, Marko",
booktitle = "Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing",
month = apr,
year = "2021",
address = "Kiyv, Ukraine",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.bsnlp-1.9/",
pages = "80--85",
abstract = "This paper describes the University of Ljubljana (UL FRI) Group`s submissions to the shared task at the Balto-Slavic Natural Language Processing (BSNLP) 2021 Workshop. We experiment with multiple BERT-based models, pre-trained in multi-lingual, Croatian-Slovene-English and Slovene-only data. We perform training iteratively and on the concatenated data of previously available NER datasets. For the normalization task we use Stanza lemmatizer, while for entity matching we implemented a baseline using the Dedupe library. The performance of evaluations suggests that multi-source settings outperform less-resourced approaches. The best NER models achieve 0.91 F-score on Slovene training data splits while the best official submission achieved F-scores of 0.84 and 0.78 for relaxed partial matching and strict settings, respectively. In multi-lingual NER setting we achieve F-scores of 0.82 and 0.74."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="prelevikj-zitnik-2021-multilingual">
<titleInfo>
<title>Multilingual Named Entity Recognition and Matching Using BERT and Dedupe for Slavic Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Prelevikj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Slavko</namePart>
<namePart type="family">Zitnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bogdan</namePart>
<namePart type="family">Babych</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Olga</namePart>
<namePart type="family">Kanishcheva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jakub</namePart>
<namePart type="family">Piskorski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lidia</namePart>
<namePart type="family">Pivovarova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vasyl</namePart>
<namePart type="family">Starko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Josef</namePart>
<namePart type="family">Steinberger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roman</namePart>
<namePart type="family">Yangarber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michał</namePart>
<namePart type="family">Marcińczuk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Senja</namePart>
<namePart type="family">Pollak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pavel</namePart>
<namePart type="family">Přibáň</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Robnik-Šikonja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Kiyv, Ukraine</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper describes the University of Ljubljana (UL FRI) Group‘s submissions to the shared task at the Balto-Slavic Natural Language Processing (BSNLP) 2021 Workshop. We experiment with multiple BERT-based models, pre-trained in multi-lingual, Croatian-Slovene-English and Slovene-only data. We perform training iteratively and on the concatenated data of previously available NER datasets. For the normalization task we use Stanza lemmatizer, while for entity matching we implemented a baseline using the Dedupe library. The performance of evaluations suggests that multi-source settings outperform less-resourced approaches. The best NER models achieve 0.91 F-score on Slovene training data splits while the best official submission achieved F-scores of 0.84 and 0.78 for relaxed partial matching and strict settings, respectively. In multi-lingual NER setting we achieve F-scores of 0.82 and 0.74.</abstract>
<identifier type="citekey">prelevikj-zitnik-2021-multilingual</identifier>
<location>
<url>https://aclanthology.org/2021.bsnlp-1.9/</url>
</location>
<part>
<date>2021-04</date>
<extent unit="page">
<start>80</start>
<end>85</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multilingual Named Entity Recognition and Matching Using BERT and Dedupe for Slavic Languages
%A Prelevikj, Marko
%A Zitnik, Slavko
%Y Babych, Bogdan
%Y Kanishcheva, Olga
%Y Nakov, Preslav
%Y Piskorski, Jakub
%Y Pivovarova, Lidia
%Y Starko, Vasyl
%Y Steinberger, Josef
%Y Yangarber, Roman
%Y Marcińczuk, Michał
%Y Pollak, Senja
%Y Přibáň, Pavel
%Y Robnik-Šikonja, Marko
%S Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing
%D 2021
%8 April
%I Association for Computational Linguistics
%C Kiyv, Ukraine
%F prelevikj-zitnik-2021-multilingual
%X This paper describes the University of Ljubljana (UL FRI) Group‘s submissions to the shared task at the Balto-Slavic Natural Language Processing (BSNLP) 2021 Workshop. We experiment with multiple BERT-based models, pre-trained in multi-lingual, Croatian-Slovene-English and Slovene-only data. We perform training iteratively and on the concatenated data of previously available NER datasets. For the normalization task we use Stanza lemmatizer, while for entity matching we implemented a baseline using the Dedupe library. The performance of evaluations suggests that multi-source settings outperform less-resourced approaches. The best NER models achieve 0.91 F-score on Slovene training data splits while the best official submission achieved F-scores of 0.84 and 0.78 for relaxed partial matching and strict settings, respectively. In multi-lingual NER setting we achieve F-scores of 0.82 and 0.74.
%U https://aclanthology.org/2021.bsnlp-1.9/
%P 80-85
Markdown (Informal)
[Multilingual Named Entity Recognition and Matching Using BERT and Dedupe for Slavic Languages](https://aclanthology.org/2021.bsnlp-1.9/) (Prelevikj & Zitnik, BSNLP 2021)
ACL