@inproceedings{peng-etal-2021-shrinking,
title = "Shrinking Bigfoot: Reducing wav2vec 2.0 footprint",
author = "Peng, Zilun and
Budhkar, Akshay and
Tuil, Ilana and
Levy, Jason and
Sobhani, Parinaz and
Cohen, Raphael and
Nassour, Jumana",
editor = "Moosavi, Nafise Sadat and
Gurevych, Iryna and
Fan, Angela and
Wolf, Thomas and
Hou, Yufang and
Marasovi{\'c}, Ana and
Ravi, Sujith",
booktitle = "Proceedings of the Second Workshop on Simple and Efficient Natural Language Processing",
month = nov,
year = "2021",
address = "Virtual",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.sustainlp-1.14/",
doi = "10.18653/v1/2021.sustainlp-1.14",
pages = "134--141",
abstract = "Wav2vec 2.0 is a state-of-the-art speech recognition model which maps speech audio waveforms into latent representations. The largest version of wav2vec 2.0 contains 317 million parameters. Hence, the inference latency of wav2vec 2.0 will be a bottleneck in production, leading to high costs and a significant environmental footprint. To improve wav2vec`s applicability to a production setting, we explore multiple model compression methods borrowed from the domain of large language models. Using a teacher-student approach, we distilled the knowledge from the original wav2vec 2.0 model into a student model, which is 2 times faster, 4.8 times smaller than the original model. More importantly, the student model is 2 times more energy efficient than the original model in terms of CO2 emission. This increase in performance is accomplished with only a 7{\%} degradation in word error rate (WER). Our quantized model is 3.6 times smaller than the original model, with only a 0.1{\%} degradation in WER. To the best of our knowledge, this is the first work that compresses wav2vec 2.0."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="peng-etal-2021-shrinking">
<titleInfo>
<title>Shrinking Bigfoot: Reducing wav2vec 2.0 footprint</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zilun</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akshay</namePart>
<namePart type="family">Budhkar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ilana</namePart>
<namePart type="family">Tuil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jason</namePart>
<namePart type="family">Levy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Parinaz</namePart>
<namePart type="family">Sobhani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raphael</namePart>
<namePart type="family">Cohen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jumana</namePart>
<namePart type="family">Nassour</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Simple and Efficient Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nafise</namePart>
<namePart type="given">Sadat</namePart>
<namePart type="family">Moosavi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iryna</namePart>
<namePart type="family">Gurevych</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Angela</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Wolf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yufang</namePart>
<namePart type="family">Hou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ana</namePart>
<namePart type="family">Marasović</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sujith</namePart>
<namePart type="family">Ravi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Virtual</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Wav2vec 2.0 is a state-of-the-art speech recognition model which maps speech audio waveforms into latent representations. The largest version of wav2vec 2.0 contains 317 million parameters. Hence, the inference latency of wav2vec 2.0 will be a bottleneck in production, leading to high costs and a significant environmental footprint. To improve wav2vec's applicability to a production setting, we explore multiple model compression methods borrowed from the domain of large language models. Using a teacher-student approach, we distilled the knowledge from the original wav2vec 2.0 model into a student model, which is 2 times faster and 4.8 times smaller than the original model. More importantly, the student model is 2 times more energy efficient than the original model in terms of CO2 emission. This increase in performance is accomplished with only a 7% degradation in word error rate (WER). Our quantized model is 3.6 times smaller than the original model, with only a 0.1% degradation in WER. To the best of our knowledge, this is the first work that compresses wav2vec 2.0.</abstract>
<identifier type="citekey">peng-etal-2021-shrinking</identifier>
<identifier type="doi">10.18653/v1/2021.sustainlp-1.14</identifier>
<location>
<url>https://aclanthology.org/2021.sustainlp-1.14/</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>134</start>
<end>141</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Shrinking Bigfoot: Reducing wav2vec 2.0 footprint
%A Peng, Zilun
%A Budhkar, Akshay
%A Tuil, Ilana
%A Levy, Jason
%A Sobhani, Parinaz
%A Cohen, Raphael
%A Nassour, Jumana
%Y Moosavi, Nafise Sadat
%Y Gurevych, Iryna
%Y Fan, Angela
%Y Wolf, Thomas
%Y Hou, Yufang
%Y Marasović, Ana
%Y Ravi, Sujith
%S Proceedings of the Second Workshop on Simple and Efficient Natural Language Processing
%D 2021
%8 November
%I Association for Computational Linguistics
%C Virtual
%F peng-etal-2021-shrinking
%X Wav2vec 2.0 is a state-of-the-art speech recognition model which maps speech audio waveforms into latent representations. The largest version of wav2vec 2.0 contains 317 million parameters. Hence, the inference latency of wav2vec 2.0 will be a bottleneck in production, leading to high costs and a significant environmental footprint. To improve wav2vec's applicability to a production setting, we explore multiple model compression methods borrowed from the domain of large language models. Using a teacher-student approach, we distilled the knowledge from the original wav2vec 2.0 model into a student model, which is 2 times faster and 4.8 times smaller than the original model. More importantly, the student model is 2 times more energy efficient than the original model in terms of CO2 emission. This increase in performance is accomplished with only a 7% degradation in word error rate (WER). Our quantized model is 3.6 times smaller than the original model, with only a 0.1% degradation in WER. To the best of our knowledge, this is the first work that compresses wav2vec 2.0.
%R 10.18653/v1/2021.sustainlp-1.14
%U https://aclanthology.org/2021.sustainlp-1.14/
%U https://doi.org/10.18653/v1/2021.sustainlp-1.14
%P 134-141
Markdown (Informal)
[Shrinking Bigfoot: Reducing wav2vec 2.0 footprint](https://aclanthology.org/2021.sustainlp-1.14/) (Peng et al., sustainlp 2021)
ACL
Zilun Peng, Akshay Budhkar, Ilana Tuil, Jason Levy, Parinaz Sobhani, Raphael Cohen, and Jumana Nassour. 2021. Shrinking Bigfoot: Reducing wav2vec 2.0 footprint. In Proceedings of the Second Workshop on Simple and Efficient Natural Language Processing, pages 134–141, Virtual. Association for Computational Linguistics.
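
The teacher-student step named in the abstract follows the standard soft-target distillation recipe from the language-model compression literature. Below is a minimal, illustrative PyTorch sketch of that loss, not the authors' actual objective; the function name and the temperature value are assumptions made for the example.

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits: torch.Tensor,
                      teacher_logits: torch.Tensor,
                      temperature: float = 2.0) -> torch.Tensor:
    """Soft-target knowledge distillation loss (illustrative sketch only).

    KL divergence between the teacher's and student's temperature-scaled
    output distributions; the paper's exact objective may differ.
    """
    t = temperature
    soft_teacher = F.softmax(teacher_logits / t, dim=-1)          # teacher targets
    log_soft_student = F.log_softmax(student_logits / t, dim=-1)  # student log-probs
    # Scale by t^2 so gradient magnitudes stay comparable across temperatures.
    return F.kl_div(log_soft_student, soft_teacher, reduction="batchmean") * (t * t)
```

The abstract's quantization result can be approximated in spirit with PyTorch's post-training dynamic quantization, e.g. `torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)`, though this is an assumed stand-in rather than the authors' documented pipeline.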