@inproceedings{arhar-holdt-etal-2024-suk,
title = "{SUK} 1.0: A New Training Corpus for Linguistic Annotation of Modern Standard {S}lovene",
author = "Arhar Holdt, {\v{S}}pela and
{\v{C}}ibej, Jaka and
Dobrovoljc, Kaja and
Erjavec, Toma{\v{z}} and
Gantar, Polona and
Krek, Simon and
Munda, Tina and
Robida, Nejc and
Ter{\v{c}}on, Luka and
Zitnik, Slavko",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1340",
pages = "15428--15435",
abstract = "This paper introduces the upgrade of a training corpus for linguistic annotation of modern standard Slovene. The enhancement spans both the size of the corpus and the depth of annotation layers. The revised SUK 1.0 corpus, building on its predecessor ssj500k 2.3, has doubled in size, containing over a million tokens. This expansion integrates three preexisting open-access datasets, all of which have undergone automatic tagging and meticulous manual review across multiple annotation layers, each represented in varying proportions. These layers span tokenization, segmentation, lemmatization, MULTEXT-East morphology, Universal Dependencies, JOS-SYN syntax, semantic role labeling, named entity recognition, and the newly incorporated coreferences. The paper illustrates the annotation processes for each layer while also presenting the results of the new CLASSLA-Stanza annotation tool, trained on the SUK corpus data. As one of the fundamental language resources of modern Slovene, the SUK corpus calls for constant development, as outlined in the concluding section.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="arhar-holdt-etal-2024-suk">
<titleInfo>
<title>SUK 1.0: A New Training Corpus for Linguistic Annotation of Modern Standard Slovene</title>
</titleInfo>
<name type="personal">
<namePart type="given">Špela</namePart>
<namePart type="family">Arhar Holdt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaka</namePart>
<namePart type="family">Čibej</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaja</namePart>
<namePart type="family">Dobrovoljc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tomaž</namePart>
<namePart type="family">Erjavec</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Polona</namePart>
<namePart type="family">Gantar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Krek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tina</namePart>
<namePart type="family">Munda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nejc</namePart>
<namePart type="family">Robida</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luka</namePart>
<namePart type="family">Terčon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Slavko</namePart>
<namePart type="family">Zitnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper introduces the upgrade of a training corpus for linguistic annotation of modern standard Slovene. The enhancement spans both the size of the corpus and the depth of annotation layers. The revised SUK 1.0 corpus, building on its predecessor ssj500k 2.3, has doubled in size, containing over a million tokens. This expansion integrates three preexisting open-access datasets, all of which have undergone automatic tagging and meticulous manual review across multiple annotation layers, each represented in varying proportions. These layers span tokenization, segmentation, lemmatization, MULTEXT-East morphology, Universal Dependencies, JOS-SYN syntax, semantic role labeling, named entity recognition, and the newly incorporated coreferences. The paper illustrates the annotation processes for each layer while also presenting the results of the new CLASSLA-Stanza annotation tool, trained on the SUK corpus data. As one of the fundamental language resources of modern Slovene, the SUK corpus calls for constant development, as outlined in the concluding section.</abstract>
<identifier type="citekey">arhar-holdt-etal-2024-suk</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1340</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>15428</start>
<end>15435</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SUK 1.0: A New Training Corpus for Linguistic Annotation of Modern Standard Slovene
%A Arhar Holdt, Špela
%A Čibej, Jaka
%A Dobrovoljc, Kaja
%A Erjavec, Tomaž
%A Gantar, Polona
%A Krek, Simon
%A Munda, Tina
%A Robida, Nejc
%A Terčon, Luka
%A Zitnik, Slavko
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F arhar-holdt-etal-2024-suk
%X This paper introduces the upgrade of a training corpus for linguistic annotation of modern standard Slovene. The enhancement spans both the size of the corpus and the depth of annotation layers. The revised SUK 1.0 corpus, building on its predecessor ssj500k 2.3, has doubled in size, containing over a million tokens. This expansion integrates three preexisting open-access datasets, all of which have undergone automatic tagging and meticulous manual review across multiple annotation layers, each represented in varying proportions. These layers span tokenization, segmentation, lemmatization, MULTEXT-East morphology, Universal Dependencies, JOS-SYN syntax, semantic role labeling, named entity recognition, and the newly incorporated coreferences. The paper illustrates the annotation processes for each layer while also presenting the results of the new CLASSLA-Stanza annotation tool, trained on the SUK corpus data. As one of the fundamental language resources of modern Slovene, the SUK corpus calls for constant development, as outlined in the concluding section.
%U https://aclanthology.org/2024.lrec-main.1340
%P 15428-15435
Markdown (Informal)
[SUK 1.0: A New Training Corpus for Linguistic Annotation of Modern Standard Slovene](https://aclanthology.org/2024.lrec-main.1340) (Arhar Holdt et al., LREC-COLING 2024)
ACL
- Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Tomaž Erjavec, Polona Gantar, Simon Krek, Tina Munda, Nejc Robida, Luka Terčon, and Slavko Zitnik. 2024. SUK 1.0: A New Training Corpus for Linguistic Annotation of Modern Standard Slovene. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 15428–15435, Torino, Italia. ELRA and ICCL.