@inproceedings{ali-etal-2024-detecting-loanwords,
title = "Detecting Loanwords in Emakhuwa: An Extremely Low-Resource {B}antu Language Exhibiting Significant Borrowing from {P}ortuguese",
author = "Ali, Felermino Dario Mario and
Lopes Cardoso, Henrique and
Sousa-Silva, Rui",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.425",
pages = "4750--4759",
abstract = "The accurate identification of loanwords within a given text holds significant potential as a valuable tool for addressing data augmentation and mitigating data sparsity issues. Such identification can improve the performance of various natural language processing tasks, particularly in the context of low-resource languages that lack standardized spelling conventions.This research proposes a supervised method to identify loanwords in Emakhuwa, borrowed from Portuguese. Our methodology encompasses a two-fold approach. Firstly, we employ traditional machine learning algorithms incorporating handcrafted features, including language-specific and similarity-based features. We build upon prior studies to extract similarity features and propose utilizing two external resources: a Sequence-to-Sequence model and a dictionary. This innovative approach allows us to identify loanwords solely by analyzing the target word without prior knowledge about its donor counterpart. Furthermore, we fine-tune the pre-trained CANINE model for the downstream task of loanword detection, which culminates in the impressive achievement of the F1-score of 93{\%}. To the best of our knowledge, this study is the first of its kind focusing on Emakhuwa, and the preliminary results are promising as they pave the way to further advancements.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ali-etal-2024-detecting-loanwords">
<titleInfo>
<title>Detecting Loanwords in Emakhuwa: An Extremely Low-Resource Bantu Language Exhibiting Significant Borrowing from Portuguese</title>
</titleInfo>
<name type="personal">
<namePart type="given">Felermino</namePart>
<namePart type="given">Dario</namePart>
<namePart type="given">Mario</namePart>
<namePart type="family">Ali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Henrique</namePart>
<namePart type="family">Lopes Cardoso</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Sousa-Silva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The accurate identification of loanwords within a given text holds significant potential as a valuable tool for addressing data augmentation and mitigating data sparsity issues. Such identification can improve the performance of various natural language processing tasks, particularly in the context of low-resource languages that lack standardized spelling conventions.This research proposes a supervised method to identify loanwords in Emakhuwa, borrowed from Portuguese. Our methodology encompasses a two-fold approach. Firstly, we employ traditional machine learning algorithms incorporating handcrafted features, including language-specific and similarity-based features. We build upon prior studies to extract similarity features and propose utilizing two external resources: a Sequence-to-Sequence model and a dictionary. This innovative approach allows us to identify loanwords solely by analyzing the target word without prior knowledge about its donor counterpart. Furthermore, we fine-tune the pre-trained CANINE model for the downstream task of loanword detection, which culminates in the impressive achievement of the F1-score of 93%. To the best of our knowledge, this study is the first of its kind focusing on Emakhuwa, and the preliminary results are promising as they pave the way to further advancements.</abstract>
<identifier type="citekey">ali-etal-2024-detecting-loanwords</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.425</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>4750</start>
<end>4759</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Detecting Loanwords in Emakhuwa: An Extremely Low-Resource Bantu Language Exhibiting Significant Borrowing from Portuguese
%A Ali, Felermino Dario Mario
%A Lopes Cardoso, Henrique
%A Sousa-Silva, Rui
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F ali-etal-2024-detecting-loanwords
%X The accurate identification of loanwords within a given text holds significant potential as a valuable tool for addressing data augmentation and mitigating data sparsity issues. Such identification can improve the performance of various natural language processing tasks, particularly in the context of low-resource languages that lack standardized spelling conventions.This research proposes a supervised method to identify loanwords in Emakhuwa, borrowed from Portuguese. Our methodology encompasses a two-fold approach. Firstly, we employ traditional machine learning algorithms incorporating handcrafted features, including language-specific and similarity-based features. We build upon prior studies to extract similarity features and propose utilizing two external resources: a Sequence-to-Sequence model and a dictionary. This innovative approach allows us to identify loanwords solely by analyzing the target word without prior knowledge about its donor counterpart. Furthermore, we fine-tune the pre-trained CANINE model for the downstream task of loanword detection, which culminates in the impressive achievement of the F1-score of 93%. To the best of our knowledge, this study is the first of its kind focusing on Emakhuwa, and the preliminary results are promising as they pave the way to further advancements.
%U https://aclanthology.org/2024.lrec-main.425
%P 4750-4759
Markdown (Informal)
[Detecting Loanwords in Emakhuwa: An Extremely Low-Resource Bantu Language Exhibiting Significant Borrowing from Portuguese](https://aclanthology.org/2024.lrec-main.425) (Ali et al., LREC-COLING 2024)
ACL