@inproceedings{abate-etal-2020-large,
title = "Large Vocabulary Read Speech Corpora for Four {E}thiopian Languages: {A}mharic, {T}igrigna, {O}romo, and {W}olaytta",
author = "Abate, Solomon Teferra and
Tachbelie, Martha Yifiru and
Melese, Michael and
Abera, Hafte and
Gebreselassie, Tewodros and
Mulugeta, Wondwossen and
Assabie, Yaregal and
Beyene, Million Meshesha and
Atinafu, Solomon and
Seyoum, Binyam Ephrem",
editor = "Cunha, Rossana and
Shaikh, Samira and
Varis, Erika and
Georgi, Ryan and
Tsai, Alicia and
Anastasopoulos, Antonios and
Chandu, Khyathi Raghavi",
booktitle = "Proceedings of the Fourth Widening Natural Language Processing Workshop",
month = jul,
year = "2020",
address = "Seattle, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.winlp-1.5/",
doi = "10.18653/v1/2020.winlp-1.5",
pages = "13--17",
abstract = "Automatic Speech Recognition (ASR) is one of the most important technologies to help people live a better life in the 21st century. However, its development requires a big speech corpus for a language. The development of such a corpus is expensive especially for under-resourced Ethiopian languages. To address this problem we have developed four medium-sized (longer than 22 hours each) speech corpora for four Ethiopian languages: Amharic, Tigrigna, Oromo, and Wolaytta. In a way of checking the usability of the corpora and deliver a baseline ASR for each language. In this paper, we present the corpora and the baseline ASR systems for each language. The word error rates (WERs) we achieved show that the corpora are usable for further investigation and we recommend the collection of text corpora to train strong language models for Oromo and Wolaytta compared to others."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="abate-etal-2020-large">
<titleInfo>
<title>Large Vocabulary Read Speech Corpora for Four Ethiopian Languages: Amharic, Tigrigna, Oromo, and Wolaytta</title>
</titleInfo>
<name type="personal">
<namePart type="given">Solomon</namePart>
<namePart type="given">Teferra</namePart>
<namePart type="family">Abate</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Martha</namePart>
<namePart type="given">Yifiru</namePart>
<namePart type="family">Tachbelie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Melese</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hafte</namePart>
<namePart type="family">Abera</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tewodros</namePart>
<namePart type="family">Gebreselassie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wondwossen</namePart>
<namePart type="family">Mulugeta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaregal</namePart>
<namePart type="family">Assabie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Million</namePart>
<namePart type="given">Meshesha</namePart>
<namePart type="family">Beyene</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Solomon</namePart>
<namePart type="family">Atinafu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Binyam</namePart>
<namePart type="given">Ephrem</namePart>
<namePart type="family">Seyoum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Widening Natural Language Processing Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rossana</namePart>
<namePart type="family">Cunha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samira</namePart>
<namePart type="family">Shaikh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erika</namePart>
<namePart type="family">Varis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Georgi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alicia</namePart>
<namePart type="family">Tsai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonios</namePart>
<namePart type="family">Anastasopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khyathi</namePart>
<namePart type="given">Raghavi</namePart>
<namePart type="family">Chandu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Seattle, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Automatic Speech Recognition (ASR) is one of the most important technologies to help people live a better life in the 21st century. However, its development requires a big speech corpus for a language. The development of such a corpus is expensive especially for under-resourced Ethiopian languages. To address this problem we have developed four medium-sized (longer than 22 hours each) speech corpora for four Ethiopian languages: Amharic, Tigrigna, Oromo, and Wolaytta. In a way of checking the usability of the corpora and deliver a baseline ASR for each language. In this paper, we present the corpora and the baseline ASR systems for each language. The word error rates (WERs) we achieved show that the corpora are usable for further investigation and we recommend the collection of text corpora to train strong language models for Oromo and Wolaytta compared to others.</abstract>
<identifier type="citekey">abate-etal-2020-large</identifier>
<identifier type="doi">10.18653/v1/2020.winlp-1.5</identifier>
<location>
<url>https://aclanthology.org/2020.winlp-1.5/</url>
</location>
<part>
<date>2020-07</date>
<extent unit="page">
<start>13</start>
<end>17</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Large Vocabulary Read Speech Corpora for Four Ethiopian Languages: Amharic, Tigrigna, Oromo, and Wolaytta
%A Abate, Solomon Teferra
%A Tachbelie, Martha Yifiru
%A Melese, Michael
%A Abera, Hafte
%A Gebreselassie, Tewodros
%A Mulugeta, Wondwossen
%A Assabie, Yaregal
%A Beyene, Million Meshesha
%A Atinafu, Solomon
%A Seyoum, Binyam Ephrem
%Y Cunha, Rossana
%Y Shaikh, Samira
%Y Varis, Erika
%Y Georgi, Ryan
%Y Tsai, Alicia
%Y Anastasopoulos, Antonios
%Y Chandu, Khyathi Raghavi
%S Proceedings of the Fourth Widening Natural Language Processing Workshop
%D 2020
%8 July
%I Association for Computational Linguistics
%C Seattle, USA
%F abate-etal-2020-large
%X Automatic Speech Recognition (ASR) is one of the most important technologies to help people live a better life in the 21st century. However, its development requires a big speech corpus for a language. The development of such a corpus is expensive especially for under-resourced Ethiopian languages. To address this problem we have developed four medium-sized (longer than 22 hours each) speech corpora for four Ethiopian languages: Amharic, Tigrigna, Oromo, and Wolaytta. In a way of checking the usability of the corpora and deliver a baseline ASR for each language. In this paper, we present the corpora and the baseline ASR systems for each language. The word error rates (WERs) we achieved show that the corpora are usable for further investigation and we recommend the collection of text corpora to train strong language models for Oromo and Wolaytta compared to others.
%R 10.18653/v1/2020.winlp-1.5
%U https://aclanthology.org/2020.winlp-1.5/
%U https://doi.org/10.18653/v1/2020.winlp-1.5
%P 13-17
Markdown (Informal)
[Large Vocabulary Read Speech Corpora for Four Ethiopian Languages: Amharic, Tigrigna, Oromo, and Wolaytta](https://aclanthology.org/2020.winlp-1.5/) (Abate et al., WiNLP 2020)
ACL
- Solomon Teferra Abate, Martha Yifiru Tachbelie, Michael Melese, Hafte Abera, Tewodros Gebreselassie, Wondwossen Mulugeta, Yaregal Assabie, Million Meshesha Beyene, Solomon Atinafu, and Binyam Ephrem Seyoum. 2020. Large Vocabulary Read Speech Corpora for Four Ethiopian Languages: Amharic, Tigrigna, Oromo, and Wolaytta. In Proceedings of the Fourth Widening Natural Language Processing Workshop, pages 13–17, Seattle, USA. Association for Computational Linguistics.