@inproceedings{hrinchuk-etal-2022-nvidia,
title = "{NVIDIA} {N}e{M}o Offline Speech Translation Systems for {IWSLT} 2022",
author = "Hrinchuk, Oleksii and
Noroozi, Vahid and
Khattar, Abhinav and
Peganov, Anton and
Subramanian, Sandeep and
Majumdar, Somshubra and
Kuchaiev, Oleksii",
editor = "Salesky, Elizabeth and
Federico, Marcello and
Costa-juss{\`a}, Marta",
booktitle = "Proceedings of the 19th International Conference on Spoken Language Translation (IWSLT 2022)",
month = may,
year = "2022",
address = "Dublin, Ireland (in-person and online)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.iwslt-1.18",
doi = "10.18653/v1/2022.iwslt-1.18",
pages = "225--231",
abstract = "This paper provides an overview of NVIDIA NeMo{'}s speech translation systems for the IWSLT 2022 Offline Speech Translation Task. Our cascade system consists of 1) Conformer RNN-T automatic speech recognition model, 2) punctuation-capitalization model based on pre-trained T5 encoder, 3) ensemble of Transformer neural machine translation models fine-tuned on TED talks. Our end-to-end model has less parameters and consists of Conformer encoder and Transformer decoder. It relies on the cascade system by re-using its pre-trained ASR encoder and training on synthetic translations generated with the ensemble of NMT models. Our En-{\textgreater}De cascade and end-to-end systems achieve 29.7 and 26.2 BLEU on the 2020 test set correspondingly, both outperforming the previous year{'}s best of 26 BLEU.",
}
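
The abstract above names a three-stage cascade: Conformer RNN-T ASR, punctuation/capitalization restoration, and Transformer NMT. A minimal sketch of such a pipeline with the open-source NeMo toolkit follows; the NGC checkpoint names are illustrative stand-ins (a BERT-based punctuation model rather than the paper's T5-encoder one, a single NMT model rather than an ensemble), not the authors' released competition configuration, and NeMo's APIs vary across releases.

# Hedged sketch (not the authors' code): the three-stage cascade from the
# abstract, wired together with NVIDIA NeMo. Checkpoint names are stand-ins,
# and transcribe()'s return type varies across NeMo releases.
import nemo.collections.asr as nemo_asr
import nemo.collections.nlp as nemo_nlp

# 1) Conformer RNN-T (transducer) speech recognition.
asr_model = nemo_asr.models.ASRModel.from_pretrained(
    "stt_en_conformer_transducer_large"
)
out = asr_model.transcribe(["talk_segment.wav"])  # hypothetical audio file
# RNN-T models may return (best_hypotheses, all_hypotheses); keep the best.
transcripts = out[0] if isinstance(out, tuple) else out

# 2) Restore punctuation and capitalization on the raw ASR output.
punct_model = nemo_nlp.models.PunctuationCapitalizationModel.from_pretrained(
    "punctuation_en_bert"
)
punctuated = punct_model.add_punctuation_capitalization(transcripts)

# 3) Translate En->De with a Transformer NMT model.
nmt_model = nemo_nlp.models.MTEncDecModel.from_pretrained(
    "nmt_en_de_transformer12x2"
)
translations = nmt_model.translate(
    punctuated, source_lang="en", target_lang="de"
)
print(translations)
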
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hrinchuk-etal-2022-nvidia">
<titleInfo>
<title>NVIDIA NeMo Offline Speech Translation Systems for IWSLT 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Oleksii</namePart>
<namePart type="family">Hrinchuk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vahid</namePart>
<namePart type="family">Noroozi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abhinav</namePart>
<namePart type="family">Khattar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anton</namePart>
<namePart type="family">Peganov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sandeep</namePart>
<namePart type="family">Subramanian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Somshubra</namePart>
<namePart type="family">Majumdar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oleksii</namePart>
<namePart type="family">Kuchaiev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th International Conference on Spoken Language Translation (IWSLT 2022)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Salesky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcello</namePart>
<namePart type="family">Federico</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="family">Costa-jussà</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland (in-person and online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper provides an overview of NVIDIA NeMo’s speech translation systems for the IWSLT 2022 Offline Speech Translation Task. Our cascade system consists of 1) a Conformer RNN-T automatic speech recognition model, 2) a punctuation-capitalization model based on a pre-trained T5 encoder, and 3) an ensemble of Transformer neural machine translation models fine-tuned on TED talks. Our end-to-end model has fewer parameters and consists of a Conformer encoder and a Transformer decoder. It relies on the cascade system by re-using its pre-trained ASR encoder and training on synthetic translations generated with the ensemble of NMT models. Our En→De cascade and end-to-end systems achieve 29.7 and 26.2 BLEU on the 2020 test set, respectively, both outperforming the previous year’s best of 26 BLEU.</abstract>
<identifier type="citekey">hrinchuk-etal-2022-nvidia</identifier>
<identifier type="doi">10.18653/v1/2022.iwslt-1.18</identifier>
<location>
<url>https://aclanthology.org/2022.iwslt-1.18</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>225</start>
<end>231</end>
</extent>
</part>
</mods>
</modsCollection>
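
The abstract also sketches the end-to-end model: a Conformer encoder initialized from the cascade's pre-trained ASR model, followed by a Transformer decoder trained on synthetic translations. The toy PyTorch module below illustrates only that wiring; the class name, dimensions, and stand-in encoder are assumptions, and the real system's layers, tokenization, and training setup are those described in the paper itself.

# Illustrative-only PyTorch sketch of the end-to-end architecture named in the
# abstract: a (pre-trained) ASR Conformer encoder re-used as-is, plus a
# Transformer decoder over target subword tokens. All names/sizes are assumed.
import torch
import torch.nn as nn

class EndToEndST(nn.Module):
    def __init__(self, asr_encoder: nn.Module, vocab_size: int, d_model: int = 512):
        super().__init__()
        self.encoder = asr_encoder  # re-used pre-trained ASR (Conformer) encoder
        self.embed = nn.Embedding(vocab_size, d_model)
        layer = nn.TransformerDecoderLayer(d_model, nhead=8, batch_first=True)
        self.decoder = nn.TransformerDecoder(layer, num_layers=6)
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, audio_features, target_tokens):
        memory = self.encoder(audio_features)       # (batch, frames, d_model)
        tgt = self.embed(target_tokens)             # (batch, tgt_len, d_model)
        # Causal mask: -inf above the diagonal blocks attention to the future.
        causal = torch.triu(
            torch.full((tgt.size(1), tgt.size(1)), float("-inf")), diagonal=1
        )
        out = self.decoder(tgt, memory, tgt_mask=causal)
        return self.proj(out)                       # logits over target vocab

# Smoke test with an Identity stand-in instead of a real Conformer encoder.
model = EndToEndST(nn.Identity(), vocab_size=1000)
feats = torch.randn(2, 50, 512)            # fake encoder-space audio features
tokens = torch.randint(0, 1000, (2, 10))   # fake target token ids
print(model(feats, tokens).shape)          # torch.Size([2, 10, 1000])
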
%0 Conference Proceedings
%T NVIDIA NeMo Offline Speech Translation Systems for IWSLT 2022
%A Hrinchuk, Oleksii
%A Noroozi, Vahid
%A Khattar, Abhinav
%A Peganov, Anton
%A Subramanian, Sandeep
%A Majumdar, Somshubra
%A Kuchaiev, Oleksii
%Y Salesky, Elizabeth
%Y Federico, Marcello
%Y Costa-jussà, Marta
%S Proceedings of the 19th International Conference on Spoken Language Translation (IWSLT 2022)
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland (in-person and online)
%F hrinchuk-etal-2022-nvidia
%X This paper provides an overview of NVIDIA NeMo’s speech translation systems for the IWSLT 2022 Offline Speech Translation Task. Our cascade system consists of 1) a Conformer RNN-T automatic speech recognition model, 2) a punctuation-capitalization model based on a pre-trained T5 encoder, and 3) an ensemble of Transformer neural machine translation models fine-tuned on TED talks. Our end-to-end model has fewer parameters and consists of a Conformer encoder and a Transformer decoder. It relies on the cascade system by re-using its pre-trained ASR encoder and training on synthetic translations generated with the ensemble of NMT models. Our En→De cascade and end-to-end systems achieve 29.7 and 26.2 BLEU on the 2020 test set, respectively, both outperforming the previous year’s best of 26 BLEU.
%R 10.18653/v1/2022.iwslt-1.18
%U https://aclanthology.org/2022.iwslt-1.18
%U https://doi.org/10.18653/v1/2022.iwslt-1.18
%P 225-231
Markdown (Informal)
[NVIDIA NeMo Offline Speech Translation Systems for IWSLT 2022](https://aclanthology.org/2022.iwslt-1.18) (Hrinchuk et al., IWSLT 2022)
ACL
Oleksii Hrinchuk, Vahid Noroozi, Abhinav Khattar, Anton Peganov, Sandeep Subramanian, Somshubra Majumdar, and Oleksii Kuchaiev. 2022. NVIDIA NeMo Offline Speech Translation Systems for IWSLT 2022. In Proceedings of the 19th International Conference on Spoken Language Translation (IWSLT 2022), pages 225–231, Dublin, Ireland (in-person and online). Association for Computational Linguistics.