@inproceedings{vuong-etal-2023-adabert,
    title = "{A}da{BERT}-{CTC}: Leveraging {BERT}-{CTC} for Text-Only Domain Adaptation in {ASR}",
    author = "Vuong, Tyler  and
      Mundnich, Karel  and
      Bekal, Dhanush  and
      Elluru, Veera  and
      Ronanki, Srikanth  and
      Bodapati, Sravan",
    editor = "Wang, Mingxuan  and
      Zitouni, Imed",
    booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track",
    month = dec,
    year = "2023",
    address = "Singapore",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.emnlp-industry.35",
    doi = "10.18653/v1/2023.emnlp-industry.35",
    pages = "364--371",
    abstract = "End-to-end (E2E) automatic speech recognition (ASR) models are becoming increasingly popular in commercial applications, such as virtual assistants, closed captioning, and dictation systems. The accuracy of the ASR is crucial to their success. However, E2E models still struggle to recognize out-of-domain words such as proper nouns and domain-specific terms. In this paper we introduce AdaBERT-CTC, a domain adaptation technique that relies solely on textual data. Our method allows for text-only adaptation by fine-tuning a pre-trained self-supervised text encoder model. Additionally, we show that our method can be made parameter-efficient by adding bottleneck adapters to the pre-trained model. This allows for adaptation with less than a 5{\%} increase in parameters and minimal computational overhead during inference. We demonstrate that our approach outperforms the base BERT-CTC model by up to 14{\%} relative word error rate improvement on several out-of-domain, publicly available datasets.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="vuong-etal-2023-adabert">
    <titleInfo>
      <title>AdaBERT-CTC: Leveraging BERT-CTC for Text-Only Domain Adaptation in ASR</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Tyler</namePart>
      <namePart type="family">Vuong</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Karel</namePart>
      <namePart type="family">Mundnich</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dhanush</namePart>
      <namePart type="family">Bekal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Veera</namePart>
      <namePart type="family">Elluru</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Srikanth</namePart>
      <namePart type="family">Ronanki</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sravan</namePart>
      <namePart type="family">Bodapati</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mingxuan</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Imed</namePart>
        <namePart type="family">Zitouni</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Singapore</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>End-to-end (E2E) automatic speech recognition (ASR) models are becoming increasingly popular in commercial applications, such as virtual assistants, closed captioning, and dictation systems. The accuracy of the ASR is crucial to their success. However, E2E models still struggle to recognize out-of-domain words such as proper nouns and domain-specific terms. In this paper we introduce AdaBERT-CTC, a domain adaptation technique that relies solely on textual data. Our method allows for text-only adaptation by fine-tuning a pre-trained self-supervised text encoder model. Additionally, we show that our method can be made parameter-efficient by adding bottleneck adapters to the pre-trained model. This allows for adaptation with less than a 5% increase in parameters and minimal computational overhead during inference. We demonstrate that our approach outperforms the base BERT-CTC model by up to 14% relative word error rate improvement on several out-of-domain, publicly available datasets.</abstract>
    <identifier type="citekey">vuong-etal-2023-adabert</identifier>
    <identifier type="doi">10.18653/v1/2023.emnlp-industry.35</identifier>
    <location>
      <url>https://aclanthology.org/2023.emnlp-industry.35</url>
    </location>
    <part>
      <date>2023-12</date>
      <extent unit="page">
        <start>364</start>
        <end>371</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T AdaBERT-CTC: Leveraging BERT-CTC for Text-Only Domain Adaptation in ASR
%A Vuong, Tyler
%A Mundnich, Karel
%A Bekal, Dhanush
%A Elluru, Veera
%A Ronanki, Srikanth
%A Bodapati, Sravan
%Y Wang, Mingxuan
%Y Zitouni, Imed
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F vuong-etal-2023-adabert
%X End-to-end (E2E) automatic speech recognition (ASR) models are becoming increasingly popular in commercial applications, such as virtual assistants, closed captioning, and dictation systems. The accuracy of the ASR is crucial to their success. However, E2E models still struggle to recognize out-of-domain words such as proper nouns and domain-specific terms. In this paper we introduce AdaBERT-CTC, a domain adaptation technique that relies solely on textual data. Our method allows for text-only adaptation by fine-tuning a pre-trained self-supervised text encoder model. Additionally, we show that our method can be made parameter-efficient by adding bottleneck adapters to the pre-trained model. This allows for adaptation with less than a 5% increase in parameters and minimal computational overhead during inference. We demonstrate that our approach outperforms the base BERT-CTC model by up to 14% relative word error rate improvement on several out-of-domain, publicly available datasets.
%R 10.18653/v1/2023.emnlp-industry.35
%U https://aclanthology.org/2023.emnlp-industry.35
%U https://doi.org/10.18653/v1/2023.emnlp-industry.35
%P 364-371
Markdown (Informal)
[AdaBERT-CTC: Leveraging BERT-CTC for Text-Only Domain Adaptation in ASR](https://aclanthology.org/2023.emnlp-industry.35) (Vuong et al., EMNLP 2023)
ACL
Tyler Vuong, Karel Mundnich, Dhanush Bekal, Veera Elluru, Srikanth Ronanki, and Sravan Bodapati. 2023. AdaBERT-CTC: Leveraging BERT-CTC for Text-Only Domain Adaptation in ASR. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 364–371, Singapore. Association for Computational Linguistics.