@inproceedings{parikh-etal-2024-ensembles,
title = "Ensembles of Hybrid and End-to-End Speech Recognition.",
author = "Parikh, Aditya Kamlesh and
ten Bosch, Louis and
van den Heuvel, Henk",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.547/",
pages = "6199--6205",
abstract = "We propose a method to combine the hybrid Kaldi-based Automatic Speech Recognition (ASR) system with the end-to-end wav2vec 2.0 XLS-R ASR using confidence measures. Our research is focused on the low-resource Irish language. Given the limited available open-source resources, neither the standalone hybrid ASR nor the end-to-end ASR system can achieve optimal performance. By applying the Recognizer Output Voting Error Reduction (ROVER) technique, we illustrate how ensemble learning could facilitate mutual error correction between both ASR systems. This paper outlines the strategies for merging the hybrid Kaldi ASR model and the end-to-end XLS-R model with the help of confidence scores. Although contemporary state-of-the-art end-to-end ASR models face challenges related to prediction overconfidence, we utilize Renyi`s entropy-based confidence approach, tuned with temperature scaling, to align it with the Kaldi ASR confidence. Although there was no significant difference in the Word Error Rate (WER) between the hybrid and end-to-end ASR, we could achieve a notable reduction in WER after ensembling through ROVER. This resulted in an almost 14{\%} Word Error Rate Reduction (WERR) on our primary test set and an approximately 20{\%} WERR on other noisy and imbalanced test data."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="parikh-etal-2024-ensembles">
<titleInfo>
<title>Ensembles of Hybrid and End-to-End Speech Recognition.</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aditya</namePart>
<namePart type="given">Kamlesh</namePart>
<namePart type="family">Parikh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Louis</namePart>
<namePart type="family">ten Bosch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Henk</namePart>
<namePart type="family">van den Heuvel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We propose a method to combine the hybrid Kaldi-based Automatic Speech Recognition (ASR) system with the end-to-end wav2vec 2.0 XLS-R ASR using confidence measures. Our research is focused on the low-resource Irish language. Given the limited available open-source resources, neither the standalone hybrid ASR nor the end-to-end ASR system can achieve optimal performance. By applying the Recognizer Output Voting Error Reduction (ROVER) technique, we illustrate how ensemble learning could facilitate mutual error correction between both ASR systems. This paper outlines the strategies for merging the hybrid Kaldi ASR model and the end-to-end XLS-R model with the help of confidence scores. Although contemporary state-of-the-art end-to-end ASR models face challenges related to prediction overconfidence, we utilize Renyi‘s entropy-based confidence approach, tuned with temperature scaling, to align it with the Kaldi ASR confidence. Although there was no significant difference in the Word Error Rate (WER) between the hybrid and end-to-end ASR, we could achieve a notable reduction in WER after ensembling through ROVER. This resulted in an almost 14% Word Error Rate Reduction (WERR) on our primary test set and an approximately 20% WERR on other noisy and imbalanced test data.</abstract>
<identifier type="citekey">parikh-etal-2024-ensembles</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.547/</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>6199</start>
<end>6205</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Ensembles of Hybrid and End-to-End Speech Recognition.
%A Parikh, Aditya Kamlesh
%A ten Bosch, Louis
%A van den Heuvel, Henk
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F parikh-etal-2024-ensembles
%X We propose a method to combine the hybrid Kaldi-based Automatic Speech Recognition (ASR) system with the end-to-end wav2vec 2.0 XLS-R ASR using confidence measures. Our research is focused on the low-resource Irish language. Given the limited available open-source resources, neither the standalone hybrid ASR nor the end-to-end ASR system can achieve optimal performance. By applying the Recognizer Output Voting Error Reduction (ROVER) technique, we illustrate how ensemble learning could facilitate mutual error correction between both ASR systems. This paper outlines the strategies for merging the hybrid Kaldi ASR model and the end-to-end XLS-R model with the help of confidence scores. Although contemporary state-of-the-art end-to-end ASR models face challenges related to prediction overconfidence, we utilize Renyi‘s entropy-based confidence approach, tuned with temperature scaling, to align it with the Kaldi ASR confidence. Although there was no significant difference in the Word Error Rate (WER) between the hybrid and end-to-end ASR, we could achieve a notable reduction in WER after ensembling through ROVER. This resulted in an almost 14% Word Error Rate Reduction (WERR) on our primary test set and an approximately 20% WERR on other noisy and imbalanced test data.
%U https://aclanthology.org/2024.lrec-main.547/
%P 6199-6205
Markdown (Informal)
[Ensembles of Hybrid and End-to-End Speech Recognition.](https://aclanthology.org/2024.lrec-main.547/) (Parikh et al., LREC-COLING 2024)
ACL
- Aditya Kamlesh Parikh, Louis ten Bosch, and Henk van den Heuvel. 2024. Ensembles of Hybrid and End-to-End Speech Recognition.. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 6199–6205, Torino, Italia. ELRA and ICCL.