@inproceedings{romanyshyn-etal-2023-learning,
  title     = {Learning Word Embeddings for {Ukrainian}: A Comparative Study of {FastText} Hyperparameters},
  author    = {Romanyshyn, Nataliia and
               Chaplynskyi, Dmytro and
               Zakharov, Kyrylo},
  editor    = {Romanyshyn, Mariana},
  booktitle = {Proceedings of the Second Ukrainian Natural Language Processing Workshop ({UNLP})},
  month     = may,
  year      = {2023},
  address   = {Dubrovnik, Croatia},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.unlp-1.3},
  doi       = {10.18653/v1/2023.unlp-1.3},
  pages     = {20--31},
  abstract  = {This study addresses the challenges of learning unsupervised word representations for the morphologically rich and low-resource Ukrainian language. Traditional models that perform decently on English do not generalize well for such languages due to a lack of sufficient data and the complexity of their grammatical structures. To overcome these challenges, we utilized a high-quality, large dataset of different genres for learning Ukrainian word vector representations. We found the best hyperparameters to train fastText language models on this dataset and performed intrinsic and extrinsic evaluations of the generated word embeddings using the established methods and metrics. The results of this study indicate that the trained vectors exhibit superior performance on intrinsic tests in comparison to existing embeddings for Ukrainian. Our best model gives 62{\%} Accuracy on the word analogy task. Extrinsic evaluations were performed on two sequence labeling tasks: NER and POS tagging (83{\%} spaCy NER F-score, 83{\%} spaCy POS Accuracy, 92{\%} Flair POS Accuracy).},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="romanyshyn-etal-2023-learning">
<titleInfo>
<title>Learning Word Embeddings for Ukrainian: A Comparative Study of FastText Hyperparameters</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nataliia</namePart>
<namePart type="family">Romanyshyn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dmytro</namePart>
<namePart type="family">Chaplynskyi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyrylo</namePart>
<namePart type="family">Zakharov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Ukrainian Natural Language Processing Workshop (UNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mariana</namePart>
<namePart type="family">Romanyshyn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dubrovnik, Croatia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This study addresses the challenges of learning unsupervised word representations for the morphologically rich and low-resource Ukrainian language. Traditional models that perform decently on English do not generalize well for such languages due to a lack of sufficient data and the complexity of their grammatical structures. To overcome these challenges, we utilized a high-quality, large dataset of different genres for learning Ukrainian word vector representations. We found the best hyperparameters to train fastText language models on this dataset and performed intrinsic and extrinsic evaluations of the generated word embeddings using the established methods and metrics. The results of this study indicate that the trained vectors exhibit superior performance on intrinsic tests in comparison to existing embeddings for Ukrainian. Our best model gives 62% Accuracy on the word analogy task. Extrinsic evaluations were performed on two sequence labeling tasks: NER and POS tagging (83% spaCy NER F-score, 83% spaCy POS Accuracy, 92% Flair POS Accuracy).</abstract>
<identifier type="citekey">romanyshyn-etal-2023-learning</identifier>
<identifier type="doi">10.18653/v1/2023.unlp-1.3</identifier>
<location>
<url>https://aclanthology.org/2023.unlp-1.3</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>20</start>
<end>31</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Learning Word Embeddings for Ukrainian: A Comparative Study of FastText Hyperparameters
%A Romanyshyn, Nataliia
%A Chaplynskyi, Dmytro
%A Zakharov, Kyrylo
%Y Romanyshyn, Mariana
%S Proceedings of the Second Ukrainian Natural Language Processing Workshop (UNLP)
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F romanyshyn-etal-2023-learning
%X This study addresses the challenges of learning unsupervised word representations for the morphologically rich and low-resource Ukrainian language. Traditional models that perform decently on English do not generalize well for such languages due to a lack of sufficient data and the complexity of their grammatical structures. To overcome these challenges, we utilized a high-quality, large dataset of different genres for learning Ukrainian word vector representations. We found the best hyperparameters to train fastText language models on this dataset and performed intrinsic and extrinsic evaluations of the generated word embeddings using the established methods and metrics. The results of this study indicate that the trained vectors exhibit superior performance on intrinsic tests in comparison to existing embeddings for Ukrainian. Our best model gives 62% Accuracy on the word analogy task. Extrinsic evaluations were performed on two sequence labeling tasks: NER and POS tagging (83% spaCy NER F-score, 83% spaCy POS Accuracy, 92% Flair POS Accuracy).
%R 10.18653/v1/2023.unlp-1.3
%U https://aclanthology.org/2023.unlp-1.3
%U https://doi.org/10.18653/v1/2023.unlp-1.3
%P 20-31
Markdown (Informal)
[Learning Word Embeddings for Ukrainian: A Comparative Study of FastText Hyperparameters](https://aclanthology.org/2023.unlp-1.3) (Romanyshyn et al., UNLP 2023)
ACL