BibTeX
@inproceedings{alastruey-etal-2022-locality,
title = "On the Locality of Attention in Direct Speech Translation",
author = "Alastruey, Belen and
Ferrando, Javier and
G{\'a}llego, Gerard I. and
Costa-juss{\`a}, Marta R.",
editor = "Louvan, Samuel and
Madotto, Andrea and
Madureira, Brielen",
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.acl-srw.32",
doi = "10.18653/v1/2022.acl-srw.32",
pages = "402--412",
abstract = "Transformers have achieved state-of-the-art results across multiple NLP tasks. However, the self-attention mechanism complexity scales quadratically with the sequence length, creating an obstacle for tasks involving long sequences, like in the speech domain. In this paper, we discuss the usefulness of self-attention for Direct Speech Translation. First, we analyze the layer-wise token contributions in the self-attention of the encoder, unveiling local diagonal patterns. To prove that some attention weights are avoidable, we propose to substitute the standard self-attention with a local efficient one, setting the amount of context used based on the results of the analysis. With this approach, our model matches the baseline performance, and improves the efficiency by skipping the computation of those weights that standard attention discards.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="alastruey-etal-2022-locality">
<titleInfo>
<title>On the Locality of Attention in Direct Speech Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Belen</namePart>
<namePart type="family">Alastruey</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Javier</namePart>
<namePart type="family">Ferrando</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gerard</namePart>
<namePart type="given">I</namePart>
<namePart type="family">Gállego</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Costa-jussà</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="family">Louvan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrea</namePart>
<namePart type="family">Madotto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brielen</namePart>
<namePart type="family">Madureira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Transformers have achieved state-of-the-art results across multiple NLP tasks. However, the self-attention mechanism complexity scales quadratically with the sequence length, creating an obstacle for tasks involving long sequences, like in the speech domain. In this paper, we discuss the usefulness of self-attention for Direct Speech Translation. First, we analyze the layer-wise token contributions in the self-attention of the encoder, unveiling local diagonal patterns. To prove that some attention weights are avoidable, we propose to substitute the standard self-attention with a local efficient one, setting the amount of context used based on the results of the analysis. With this approach, our model matches the baseline performance, and improves the efficiency by skipping the computation of those weights that standard attention discards.</abstract>
<identifier type="citekey">alastruey-etal-2022-locality</identifier>
<identifier type="doi">10.18653/v1/2022.acl-srw.32</identifier>
<location>
<url>https://aclanthology.org/2022.acl-srw.32</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>402</start>
<end>412</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T On the Locality of Attention in Direct Speech Translation
%A Alastruey, Belen
%A Ferrando, Javier
%A Gállego, Gerard I.
%A Costa-jussà, Marta R.
%Y Louvan, Samuel
%Y Madotto, Andrea
%Y Madureira, Brielen
%S Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F alastruey-etal-2022-locality
%X Transformers have achieved state-of-the-art results across multiple NLP tasks. However, the self-attention mechanism complexity scales quadratically with the sequence length, creating an obstacle for tasks involving long sequences, like in the speech domain. In this paper, we discuss the usefulness of self-attention for Direct Speech Translation. First, we analyze the layer-wise token contributions in the self-attention of the encoder, unveiling local diagonal patterns. To prove that some attention weights are avoidable, we propose to substitute the standard self-attention with a local efficient one, setting the amount of context used based on the results of the analysis. With this approach, our model matches the baseline performance, and improves the efficiency by skipping the computation of those weights that standard attention discards.
%R 10.18653/v1/2022.acl-srw.32
%U https://aclanthology.org/2022.acl-srw.32
%U https://doi.org/10.18653/v1/2022.acl-srw.32
%P 402-412
Markdown (Informal)
[On the Locality of Attention in Direct Speech Translation](https://aclanthology.org/2022.acl-srw.32) (Alastruey et al., ACL 2022)
ACL
Belen Alastruey, Javier Ferrando, Gerard I. Gállego, and Marta R. Costa-jussà. 2022. On the Locality of Attention in Direct Speech Translation. In Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop, pages 402–412, Dublin, Ireland. Association for Computational Linguistics.
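
The "local efficient" attention described in the abstract can be pictured as a banded self-attention, where each position attends only to neighbours within a fixed window. Below is a minimal NumPy sketch, not the authors' implementation: the function name local_self_attention, the one-sided window radius w, and the single-head, projection-free formulation are all illustrative assumptions.

import numpy as np

def local_self_attention(x, w):
    """Illustrative banded self-attention sketch (not the paper's code).
    x: (seq_len, d) input features; w: one-sided window radius."""
    d = x.shape[-1]
    scores = x @ x.T / np.sqrt(d)                   # full dot-product scores, (seq_len, seq_len)
    idx = np.arange(x.shape[0])
    mask = np.abs(idx[:, None] - idx[None, :]) > w  # True outside the diagonal band
    scores = np.where(mask, -np.inf, scores)        # discard out-of-window weights
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)  # row-wise softmax over the band
    return weights @ x                              # (seq_len, d)

out = local_self_attention(np.random.randn(10, 8), w=2)  # hypothetical window of 2 per side
print(out.shape)  # (10, 8)

Note that this sketch only masks the out-of-window scores after computing them; an actually efficient implementation, as the abstract describes, skips computing those weights altogether, which is where the efficiency gain over standard attention comes from.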