% LREC-COLING 2024 paper introducing the Samromur Milljon Icelandic ASR corpus.
% Braces in the title protect only the capitals that must survive sentence-casing.
@inproceedings{hernandez-mena-etal-2024-samromur,
    title = "{S}amr{\'o}mur {M}illj{\'o}n: An {ASR} Corpus of One Million Verified Read Prompts in {I}celandic",
    author = "Hernandez Mena, Carlos Daniel  and
      Gunnarsson, {\TH}orsteinn Da{\dh}i  and
      Gudnason, Jon",
    editor = "Calzolari, Nicoletta  and
      Kan, Min-Yen  and
      Hoste, Veronique  and
      Lenci, Alessandro  and
      Sakti, Sakriani  and
      Xue, Nianwen",
    booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
    month = may,
    year = "2024",
    address = "Torino, Italia",
    publisher = "ELRA and ICCL",
    url = "https://aclanthology.org/2024.lrec-main.1246/",
    pages = "14305--14312",
    abstract = "The platform samromur.is, or {\textquotedblleft}Samr{\'o}mur{\textquotedblright} for short, is a crowdsourcing web application built on Mozilla's Common Voice, designed to accumulate speech data for the advancement of language technologies in Icelandic. Over the years, Samr{\'o}mur has proven to be remarkably successful in amassing a significant number of high-quality audio clips from thousands of users. However, the challenge of manually verifying the entirety of the collected data has hindered its effective exploitation, especially in the realm of Automatic Speech Recognition (ASR), its original purpose. In this paper, we introduce the {\textquotedblleft}Samr{\'o}mur Millj{\'o}n{\textquotedblright} corpus, an ASR dataset comprising one million audio clips from Samr{\'o}mur. These clips have been automatically verified using state-of-the-art speech recognition systems such as NeMo, Wav2Vec2, and Whisper. Additionally, we present the ASR results obtained from creating acoustic models based on Samr{\'o}mur Millj{\'o}n. These results demonstrate significant promise when compared to other acoustic models trained with a similar volume of Icelandic data from different sources."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hernandez-mena-etal-2024-samromur">
<titleInfo>
<title>Samrómur Milljón: An ASR Corpus of One Million Verified Read Prompts in Icelandic</title>
</titleInfo>
<name type="personal">
<namePart type="given">Carlos</namePart>
<namePart type="given">Daniel</namePart>
<namePart type="family">Hernandez Mena</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Þorsteinn</namePart>
<namePart type="given">Daði</namePart>
<namePart type="family">Gunnarsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jon</namePart>
<namePart type="family">Gudnason</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The platform samromur.is, or “Samrómur” for short, is a crowdsourcing web application built on Mozilla’s Common Voice, designed to accumulate speech data for the advancement of language technologies in Icelandic. Over the years, Samrómur has proven to be remarkably successful in amassing a significant number of high-quality audio clips from thousands of users. However, the challenge of manually verifying the entirety of the collected data has hindered its effective exploitation, especially in the realm of Automatic Speech Recognition (ASR), its original purpose. In this paper, we introduce the “Samrómur Milljón” corpus, an ASR dataset comprising one million audio clips from Samrómur. These clips have been automatically verified using state-of-the-art speech recognition systems such as NeMo, Wav2Vec2, and Whisper. Additionally, we present the ASR results obtained from creating acoustic models based on Samrómur Milljón. These results demonstrate significant promise when compared to other acoustic models trained with a similar volume of Icelandic data from different sources.</abstract>
<identifier type="citekey">hernandez-mena-etal-2024-samromur</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1246/</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>14305</start>
<end>14312</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Samrómur Milljón: An ASR Corpus of One Million Verified Read Prompts in Icelandic
%A Hernandez Mena, Carlos Daniel
%A Gunnarsson, Þorsteinn Daði
%A Gudnason, Jon
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F hernandez-mena-etal-2024-samromur
%X The platform samromur.is, or “Samrómur” for short, is a crowdsourcing web application built on Mozilla’s Common Voice, designed to accumulate speech data for the advancement of language technologies in Icelandic. Over the years, Samrómur has proven to be remarkably successful in amassing a significant number of high-quality audio clips from thousands of users. However, the challenge of manually verifying the entirety of the collected data has hindered its effective exploitation, especially in the realm of Automatic Speech Recognition (ASR), its original purpose. In this paper, we introduce the “Samrómur Milljón” corpus, an ASR dataset comprising one million audio clips from Samrómur. These clips have been automatically verified using state-of-the-art speech recognition systems such as NeMo, Wav2Vec2, and Whisper. Additionally, we present the ASR results obtained from creating acoustic models based on Samrómur Milljón. These results demonstrate significant promise when compared to other acoustic models trained with a similar volume of Icelandic data from different sources.
%U https://aclanthology.org/2024.lrec-main.1246/
%P 14305-14312
Markdown (Informal)
[Samrómur Milljón: An ASR Corpus of One Million Verified Read Prompts in Icelandic](https://aclanthology.org/2024.lrec-main.1246/) (Hernandez Mena et al., LREC-COLING 2024)
ACL