BibTeX
@inproceedings{jayanthi-etal-2023-retrieve,
title = "Retrieve and Copy: Scaling {ASR} Personalization to Large Catalogs",
author = "Jayanthi, Sai Muralidhar and
Kulshreshtha, Devang and
Dingliwal, Saket and
Ronanki, Srikanth and
Bodapati, Sravan",
editor = "Wang, Mingxuan and
Zitouni, Imed",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-industry.60",
doi = "10.18653/v1/2023.emnlp-industry.60",
pages = "631--639",
abstract = "Personalization of automatic speech recognition (ASR) models is a widely studied topic because of its many practical applications. Most recently, attention-based contextual biasing techniques are used to improve the recognition of rare words and/or domain specific entities. However, due to performance constraints, the biasing is often limited to a few thousand entities, restricting real-world usability. To address this, we first propose a {``}Retrieve and Copy{''} mechanism to improve latency while retaining the accuracy even when scaled to a large catalog. We also propose a training strategy to overcome the degradation in recall at such scale due to an increased number of confusing entities. Overall, our approach achieves up to 6{\%} more Word Error Rate reduction (WERR) and 3.6{\%} absolute improvement in F1 when compared to a strong baseline. Our method also allows for large catalog sizes of up to 20K without significantly affecting WER and F1-scores, while achieving at least 20{\%} inference speedup per acoustic frame.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="jayanthi-etal-2023-retrieve">
    <titleInfo>
      <title>Retrieve and Copy: Scaling ASR Personalization to Large Catalogs</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Sai</namePart>
      <namePart type="given">Muralidhar</namePart>
      <namePart type="family">Jayanthi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Devang</namePart>
      <namePart type="family">Kulshreshtha</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Saket</namePart>
      <namePart type="family">Dingliwal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Srikanth</namePart>
      <namePart type="family">Ronanki</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sravan</namePart>
      <namePart type="family">Bodapati</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mingxuan</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Imed</namePart>
        <namePart type="family">Zitouni</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Singapore</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Personalization of automatic speech recognition (ASR) models is a widely studied topic because of its many practical applications. Most recently, attention-based contextual biasing techniques are used to improve the recognition of rare words and/or domain specific entities. However, due to performance constraints, the biasing is often limited to a few thousand entities, restricting real-world usability. To address this, we first propose a “Retrieve and Copy” mechanism to improve latency while retaining the accuracy even when scaled to a large catalog. We also propose a training strategy to overcome the degradation in recall at such scale due to an increased number of confusing entities. Overall, our approach achieves up to 6% more Word Error Rate reduction (WERR) and 3.6% absolute improvement in F1 when compared to a strong baseline. Our method also allows for large catalog sizes of up to 20K without significantly affecting WER and F1-scores, while achieving at least 20% inference speedup per acoustic frame.</abstract>
    <identifier type="citekey">jayanthi-etal-2023-retrieve</identifier>
    <identifier type="doi">10.18653/v1/2023.emnlp-industry.60</identifier>
    <location>
      <url>https://aclanthology.org/2023.emnlp-industry.60</url>
    </location>
    <part>
      <date>2023-12</date>
      <extent unit="page">
        <start>631</start>
        <end>639</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Retrieve and Copy: Scaling ASR Personalization to Large Catalogs
%A Jayanthi, Sai Muralidhar
%A Kulshreshtha, Devang
%A Dingliwal, Saket
%A Ronanki, Srikanth
%A Bodapati, Sravan
%Y Wang, Mingxuan
%Y Zitouni, Imed
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F jayanthi-etal-2023-retrieve
%X Personalization of automatic speech recognition (ASR) models is a widely studied topic because of its many practical applications. Most recently, attention-based contextual biasing techniques are used to improve the recognition of rare words and/or domain specific entities. However, due to performance constraints, the biasing is often limited to a few thousand entities, restricting real-world usability. To address this, we first propose a “Retrieve and Copy” mechanism to improve latency while retaining the accuracy even when scaled to a large catalog. We also propose a training strategy to overcome the degradation in recall at such scale due to an increased number of confusing entities. Overall, our approach achieves up to 6% more Word Error Rate reduction (WERR) and 3.6% absolute improvement in F1 when compared to a strong baseline. Our method also allows for large catalog sizes of up to 20K without significantly affecting WER and F1-scores, while achieving at least 20% inference speedup per acoustic frame.
%R 10.18653/v1/2023.emnlp-industry.60
%U https://aclanthology.org/2023.emnlp-industry.60
%U https://doi.org/10.18653/v1/2023.emnlp-industry.60
%P 631-639
Markdown (Informal)
[Retrieve and Copy: Scaling ASR Personalization to Large Catalogs](https://aclanthology.org/2023.emnlp-industry.60) (Jayanthi et al., EMNLP 2023)
ACL
Sai Muralidhar Jayanthi, Devang Kulshreshtha, Saket Dingliwal, Srikanth Ronanki, and Sravan Bodapati. 2023. Retrieve and Copy: Scaling ASR Personalization to Large Catalogs. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 631–639, Singapore. Association for Computational Linguistics.
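
For intuition only: the abstract describes a "Retrieve and Copy" mechanism that first retrieves a small candidate set from the large entity catalog and then biases decoding toward only those candidates, which is why the per-frame cost stops scaling with catalog size. The Python sketch below illustrates that retrieve-then-bias control flow; every name, shape, and scoring choice in it is an illustrative assumption, not the paper's actual architecture.

# Hedged sketch of a retrieve-then-bias pipeline in the spirit of the
# "Retrieve and Copy" abstract above. Every name, dimension, and scoring
# choice here is an illustrative assumption, not the paper's method.
import numpy as np

rng = np.random.default_rng(0)

d = 64                 # embedding size (assumed)
catalog_size = 20_000  # catalog scale mentioned in the abstract
k = 100                # candidates retrieved per utterance (assumed)

# Entity embeddings, assumed precomputed by some text encoder and
# L2-normalized so a dot product acts as cosine similarity.
entity_emb = rng.standard_normal((catalog_size, d)).astype(np.float32)
entity_emb /= np.linalg.norm(entity_emb, axis=1, keepdims=True)

def retrieve(query: np.ndarray, k: int) -> np.ndarray:
    """Once per utterance: indices of the k entities most similar to the query."""
    scores = entity_emb @ query
    return np.argpartition(-scores, k)[:k]   # top-k, order irrelevant here

def bias_frame(frame_enc: np.ndarray, retrieved: np.ndarray) -> np.ndarray:
    """Once per acoustic frame: attend over the k retrieved entities only,
    so the per-frame cost is O(k * d) instead of O(catalog_size * d)."""
    cand = entity_emb[retrieved]             # (k, d)
    s = cand @ frame_enc                     # attention logits, (k,)
    attn = np.exp(s - s.max())
    attn /= attn.sum()                       # softmax over candidates
    return frame_enc + attn @ cand           # biased frame encoding, (d,)

# Toy usage: one utterance-level query vector and five frame encodings.
query = rng.standard_normal(d).astype(np.float32)
query /= np.linalg.norm(query)
topk = retrieve(query, k)
frames = rng.standard_normal((5, d)).astype(np.float32)
biased = np.stack([bias_frame(f, topk) for f in frames])
print(biased.shape)                          # (5, 64)

Note the division of labor that the abstract's latency claim relies on: retrieval over the full 20K-entity catalog runs once per utterance, while the per-frame biasing step touches only the k retrieved candidates.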