@inproceedings{li-etal-2024-cb,
title = "{CB}-Whisper: Contextual Biasing Whisper Using Open-Vocabulary Keyword-Spotting",
author = "Li, Yuang and
Li, Yinglu and
Zhang, Min and
Su, Chang and
Yu, Jiawei and
Piao, Mengyao and
Qiao, Xiaosong and
Ma, Miaomiao and
Zhao, Yanqing and
Yang, Hao",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.262",
pages = "2941--2946",
abstract = "End-to-end automatic speech recognition (ASR) systems often struggle to recognize rare name entities, such as personal names, organizations and terminologies that are not frequently encountered in the training data. This paper presents Contextual Biasing Whisper (CB-Whisper), a novel ASR system based on OpenAI{'}s Whisper model that can recognize user-defined name entities by performing open-vocabulary keyword-spotting (KWS) before the decoder. The KWS module leverages text-to-speech (TTS) techniques and a convolutional neural network (CNN) classifier to match the features between the entities and the utterances. To integrate the recognized entities into the Whipser decoder and avoid hallucinations, we carefully crafted multiple prompts with spoken form hints. Experiments show that the KWS module based on Whisper encoder{'}s features can recognize unseen user-defined keywords effectively. More importantly, the proposed CB-Whisper substantially improves the mixed-error-rate (MER) and entity recall compared to the original Whisper model on three internal datasets and two publicly available datasets including Aishell and ACL datasets that cover English-only, Chinese-only, and code-switching scenarios.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2024-cb">
<titleInfo>
<title>CB-Whisper: Contextual Biasing Whisper Using Open-Vocabulary Keyword-Spotting</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuang</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yinglu</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chang</namePart>
<namePart type="family">Su</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiawei</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mengyao</namePart>
<namePart type="family">Piao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaosong</namePart>
<namePart type="family">Qiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miaomiao</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanqing</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>End-to-end automatic speech recognition (ASR) systems often struggle to recognize rare name entities, such as personal names, organizations and terminologies that are not frequently encountered in the training data. This paper presents Contextual Biasing Whisper (CB-Whisper), a novel ASR system based on OpenAI’s Whisper model that can recognize user-defined name entities by performing open-vocabulary keyword-spotting (KWS) before the decoder. The KWS module leverages text-to-speech (TTS) techniques and a convolutional neural network (CNN) classifier to match the features between the entities and the utterances. To integrate the recognized entities into the Whipser decoder and avoid hallucinations, we carefully crafted multiple prompts with spoken form hints. Experiments show that the KWS module based on Whisper encoder’s features can recognize unseen user-defined keywords effectively. More importantly, the proposed CB-Whisper substantially improves the mixed-error-rate (MER) and entity recall compared to the original Whisper model on three internal datasets and two publicly available datasets including Aishell and ACL datasets that cover English-only, Chinese-only, and code-switching scenarios.</abstract>
<identifier type="citekey">li-etal-2024-cb</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.262</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>2941</start>
<end>2946</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CB-Whisper: Contextual Biasing Whisper Using Open-Vocabulary Keyword-Spotting
%A Li, Yuang
%A Li, Yinglu
%A Zhang, Min
%A Su, Chang
%A Yu, Jiawei
%A Piao, Mengyao
%A Qiao, Xiaosong
%A Ma, Miaomiao
%A Zhao, Yanqing
%A Yang, Hao
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F li-etal-2024-cb
%X End-to-end automatic speech recognition (ASR) systems often struggle to recognize rare name entities, such as personal names, organizations and terminologies that are not frequently encountered in the training data. This paper presents Contextual Biasing Whisper (CB-Whisper), a novel ASR system based on OpenAI’s Whisper model that can recognize user-defined name entities by performing open-vocabulary keyword-spotting (KWS) before the decoder. The KWS module leverages text-to-speech (TTS) techniques and a convolutional neural network (CNN) classifier to match the features between the entities and the utterances. To integrate the recognized entities into the Whipser decoder and avoid hallucinations, we carefully crafted multiple prompts with spoken form hints. Experiments show that the KWS module based on Whisper encoder’s features can recognize unseen user-defined keywords effectively. More importantly, the proposed CB-Whisper substantially improves the mixed-error-rate (MER) and entity recall compared to the original Whisper model on three internal datasets and two publicly available datasets including Aishell and ACL datasets that cover English-only, Chinese-only, and code-switching scenarios.
%U https://aclanthology.org/2024.lrec-main.262
%P 2941-2946
Markdown (Informal)
[CB-Whisper: Contextual Biasing Whisper Using Open-Vocabulary Keyword-Spotting](https://aclanthology.org/2024.lrec-main.262) (Li et al., LREC-COLING 2024)
ACL
- Yuang Li, Yinglu Li, Min Zhang, Chang Su, Jiawei Yu, Mengyao Piao, Xiaosong Qiao, Miaomiao Ma, Yanqing Zhao, and Hao Yang. 2024. CB-Whisper: Contextual Biasing Whisper Using Open-Vocabulary Keyword-Spotting. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 2941–2946, Torino, Italia. ELRA and ICCL.