@inproceedings{destaw-etal-2021-development,
title = "The Development of Pre-processing Tools and Pre-trained Embedding Models for {A}mharic",
author = "Belay, Tadesse Destaw and
Ayele, Abinew and
Yimam, Seid Muhie",
editor = "Varis, Erika and
Georgi, Ryan and
Tsai, Alicia and
Anastasopoulos, Antonios and
Chandu, Kyathi and
Schofield, Xanda and
Ranathunga, Surangika and
Lepp, Haley and
Ghosal, Tirthankar",
booktitle = "Proceedings of the Fifth Workshop on Widening Natural Language Processing",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.winlp-1.5/",
pages = "25--28",
abstract = "Amharic is the second most spoken Semitic language after Arabic and serves as the official working language of Ethiopia. While Amharic NLP research is getting wider attention recently, the main bottleneck is that the resources and related tools are not publicly released, which makes it still a low-resource language. Due to this reason, we observe that different researchers try to repeat the same NLP research again and again. In this work, we investigate the existing approach in Amharic NLP and take the first step to publicly release tools, datasets, and models to advance Amharic NLP research. We build Python-based preprocessing tools for Amharic (tokenizer, sentence segmenter, and text cleaner) that can easily be used and integrated for the development of NLP applications. Furthermore, we compiled the first moderately large-scale Amharic text corpus (6.8m sentences) along with the word2Vec, fastText, RoBERTa, and FLAIR embeddings models. Finally, we compile benchmark datasets and build classification models for the named entity recognition task."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="destaw-etal-2021-development">
<titleInfo>
<title>The Development of Pre-processing Tools and Pre-trained Embedding Models for Amharic</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tadesse</namePart>
<namePart type="given">Destaw</namePart>
<namePart type="family">Belay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abinew</namePart>
<namePart type="family">Ayele</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seid</namePart>
<namePart type="given">Muhie</namePart>
<namePart type="family">Yimam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Widening Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Erika</namePart>
<namePart type="family">Varis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Georgi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alicia</namePart>
<namePart type="family">Tsai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonios</namePart>
<namePart type="family">Anastasopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyathi</namePart>
<namePart type="family">Chandu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xanda</namePart>
<namePart type="family">Schofield</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Surangika</namePart>
<namePart type="family">Ranathunga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haley</namePart>
<namePart type="family">Lepp</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tirthankar</namePart>
<namePart type="family">Ghosal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Amharic is the second most spoken Semitic language after Arabic and serves as the official working language of Ethiopia. While Amharic NLP research is getting wider attention recently, the main bottleneck is that the resources and related tools are not publicly released, which makes it still a low-resource language. Due to this reason, we observe that different researchers try to repeat the same NLP research again and again. In this work, we investigate the existing approach in Amharic NLP and take the first step to publicly release tools, datasets, and models to advance Amharic NLP research. We build Python-based preprocessing tools for Amharic (tokenizer, sentence segmenter, and text cleaner) that can easily be used and integrated for the development of NLP applications. Furthermore, we compiled the first moderately large-scale Amharic text corpus (6.8m sentences) along with the word2Vec, fastText, RoBERTa, and FLAIR embeddings models. Finally, we compile benchmark datasets and build classification models for the named entity recognition task.</abstract>
<identifier type="citekey">destaw-etal-2021-development</identifier>
<location>
<url>https://aclanthology.org/2021.winlp-1.5/</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>25</start>
<end>28</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Development of Pre-processing Tools and Pre-trained Embedding Models for Amharic
%A Belay, Tadesse Destaw
%A Ayele, Abinew
%A Yimam, Seid Muhie
%Y Varis, Erika
%Y Georgi, Ryan
%Y Tsai, Alicia
%Y Anastasopoulos, Antonios
%Y Chandu, Kyathi
%Y Schofield, Xanda
%Y Ranathunga, Surangika
%Y Lepp, Haley
%Y Ghosal, Tirthankar
%S Proceedings of the Fifth Workshop on Widening Natural Language Processing
%D 2021
%8 November
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic
%F destaw-etal-2021-development
%X Amharic is the second most spoken Semitic language after Arabic and serves as the official working language of Ethiopia. While Amharic NLP research is getting wider attention recently, the main bottleneck is that the resources and related tools are not publicly released, which makes it still a low-resource language. Due to this reason, we observe that different researchers try to repeat the same NLP research again and again. In this work, we investigate the existing approach in Amharic NLP and take the first step to publicly release tools, datasets, and models to advance Amharic NLP research. We build Python-based preprocessing tools for Amharic (tokenizer, sentence segmenter, and text cleaner) that can easily be used and integrated for the development of NLP applications. Furthermore, we compiled the first moderately large-scale Amharic text corpus (6.8m sentences) along with the word2Vec, fastText, RoBERTa, and FLAIR embeddings models. Finally, we compile benchmark datasets and build classification models for the named entity recognition task.
%U https://aclanthology.org/2021.winlp-1.5/
%P 25-28
Markdown (Informal)
[The Development of Pre-processing Tools and Pre-trained Embedding Models for Amharic](https://aclanthology.org/2021.winlp-1.5/) (Belay et al., WiNLP 2021)
ACL