@inproceedings{ogunremi-etal-2024-iroyinspeech,
title = "{{\`I}}r{\`o}y{\`i}n{S}peech: A Multi-purpose {Y}or{\`u}b{\'a} Speech Corpus",
author = "Ogunremi, Tolulope and
Tubosun, Kola and
Aremu, Anuoluwapo and
Orife, Iroro and
Adelani, David Ifeoluwa",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.812/",
pages = "9296--9303",
abstract = "We introduce {\`I}r{\`o}y{\`i}nSpeech corpus{---}a new dataset influenced by a desire to increase the amount of high quality, freely available, contemporary Yor{\`u}b{\'a} speech data that can be used for both Text-to-Speech (TTS) and Automatic Speech Recognition (ASR) tasks. We curated about 23,000 text sentences from the news and creative writing domains with an open license i.e., CC-BY-4.0 and asked multiple speakers to record each sentence. To encourage a more participatory approach to data creation, we provide 5 000 utterances from the curated sentences to the Mozilla Common Voice platform to crowd-source the recording and validation of Yor{\`u}b{\'a} speech data. In total, we created about 42 hours of speech data recorded by 80 volunteers in-house, and 6 hours of validated recordings on the Mozilla Common Voice platform. Our evaluation on TTS shows that we can create a good quality general domain single-speaker TTS model for Yor{\`u}b{\'a} with as little as 5 hours of speech by leveraging an end-to-end VITS architecture. Similarly, for ASR, we obtained a WER of 21.5."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ogunremi-etal-2024-iroyinspeech">
<titleInfo>
<title>ÌròyìnSpeech: A Multi-purpose Yorùbá Speech Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tolulope</namePart>
<namePart type="family">Ogunremi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kola</namePart>
<namePart type="family">Tubosun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anuoluwapo</namePart>
<namePart type="family">Aremu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iroro</namePart>
<namePart type="family">Orife</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="given">Ifeoluwa</namePart>
<namePart type="family">Adelani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We introduce ÌròyìnSpeech corpus—a new dataset influenced by a desire to increase the amount of high quality, freely available, contemporary Yorùbá speech data that can be used for both Text-to-Speech (TTS) and Automatic Speech Recognition (ASR) tasks. We curated about 23,000 text sentences from the news and creative writing domains with an open license i.e., CC-BY-4.0 and asked multiple speakers to record each sentence. To encourage a more participatory approach to data creation, we provide 5 000 utterances from the curated sentences to the Mozilla Common Voice platform to crowd-source the recording and validation of Yorùbá speech data. In total, we created about 42 hours of speech data recorded by 80 volunteers in-house, and 6 hours of validated recordings on the Mozilla Common Voice platform. Our evaluation on TTS shows that we can create a good quality general domain single-speaker TTS model for Yorùbá with as little as 5 hours of speech by leveraging an end-to-end VITS architecture. Similarly, for ASR, we obtained a WER of 21.5.</abstract>
<identifier type="citekey">ogunremi-etal-2024-iroyinspeech</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.812/</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>9296</start>
<end>9303</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ÌròyìnSpeech: A Multi-purpose Yorùbá Speech Corpus
%A Ogunremi, Tolulope
%A Tubosun, Kola
%A Aremu, Anuoluwapo
%A Orife, Iroro
%A Adelani, David Ifeoluwa
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F ogunremi-etal-2024-iroyinspeech
%X We introduce ÌròyìnSpeech corpus—a new dataset influenced by a desire to increase the amount of high quality, freely available, contemporary Yorùbá speech data that can be used for both Text-to-Speech (TTS) and Automatic Speech Recognition (ASR) tasks. We curated about 23,000 text sentences from the news and creative writing domains with an open license i.e., CC-BY-4.0 and asked multiple speakers to record each sentence. To encourage a more participatory approach to data creation, we provide 5 000 utterances from the curated sentences to the Mozilla Common Voice platform to crowd-source the recording and validation of Yorùbá speech data. In total, we created about 42 hours of speech data recorded by 80 volunteers in-house, and 6 hours of validated recordings on the Mozilla Common Voice platform. Our evaluation on TTS shows that we can create a good quality general domain single-speaker TTS model for Yorùbá with as little as 5 hours of speech by leveraging an end-to-end VITS architecture. Similarly, for ASR, we obtained a WER of 21.5.
%U https://aclanthology.org/2024.lrec-main.812/
%P 9296-9303
Markdown (Informal)
[ÌròyìnSpeech: A Multi-purpose Yorùbá Speech Corpus](https://aclanthology.org/2024.lrec-main.812/) (Ogunremi et al., LREC-COLING 2024)
ACL
- Tolulope Ogunremi, Kola Tubosun, Anuoluwapo Aremu, Iroro Orife, and David Ifeoluwa Adelani. 2024. ÌròyìnSpeech: A Multi-purpose Yorùbá Speech Corpus. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 9296–9303, Torino, Italia. ELRA and ICCL.