@inproceedings{zhang-etal-2023-adaptive,
title = "Adaptive Attention for Sparse-based Long-sequence Transformer",
author = "Zhang, Xuanyu and
Lv, Zhepeng and
Yang, Qing",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.546",
doi = "10.18653/v1/2023.findings-acl.546",
pages = "8602--8610",
abstract = "Recently, Transformers have been widely used in various fields and have achieved remarkable results. But it is still difficult for Transformer-based models to process longer sequences because self-attention in them scales quadratically with the sequence length. Although some models attempt to use sparse attention to reduce computational complexity, hand-crafted attention patterns are unable to select useful tokens adaptively according to the context. Thus, in this paper, we propose a novel efficient Transformer model with adaptive attention, A2-Former, for long sequence modeling. It can select useful tokens automatically in sparse attention by learnable position vectors, which consist of meta position and offset position vectors. Because the learnable offset position is not an integer vector, we utilize the interpolation technique to gather corresponding vectors from the input embedding matrix by discrete indexes. Experiments on Long Range Arena (LRA), a systematic and unified benchmark with different tasks, show that our model has achieved further improvement in performance compared with other sparse-based Transformers.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2023-adaptive">
<titleInfo>
<title>Adaptive Attention for Sparse-based Long-sequence Transformer</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xuanyu</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhepeng</namePart>
<namePart type="family">Lv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qing</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recently, Transformers have been widely used in various fields and have achieved remarkable results. But it is still difficult for Transformer-based models to process longer sequences because self-attention in them scales quadratically with the sequence length. Although some models attempt to use sparse attention to reduce computational complexity, hand-crafted attention patterns are unable to select useful tokens adaptively according to the context. Thus, in this paper, we propose a novel efficient Transformer model with adaptive attention, A2-Former, for long sequence modeling. It can select useful tokens automatically in sparse attention by learnable position vectors, which consist of meta position and offset position vectors. Because the learnable offset position is not an integer vector, we utilize the interpolation technique to gather corresponding vectors from the input embedding matrix by discrete indexes. Experiments on Long Range Arena (LRA), a systematic and unified benchmark with different tasks, show that our model has achieved further improvement in performance compared with other sparse-based Transformers.</abstract>
<identifier type="citekey">zhang-etal-2023-adaptive</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.546</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.546</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>8602</start>
<end>8610</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Adaptive Attention for Sparse-based Long-sequence Transformer
%A Zhang, Xuanyu
%A Lv, Zhepeng
%A Yang, Qing
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F zhang-etal-2023-adaptive
%X Recently, Transformers have been widely used in various fields and have achieved remarkable results. But it is still difficult for Transformer-based models to process longer sequences because self-attention in them scales quadratically with the sequence length. Although some models attempt to use sparse attention to reduce computational complexity, hand-crafted attention patterns are unable to select useful tokens adaptively according to the context. Thus, in this paper, we propose a novel efficient Transformer model with adaptive attention, A2-Former, for long sequence modeling. It can select useful tokens automatically in sparse attention by learnable position vectors, which consist of meta position and offset position vectors. Because the learnable offset position is not an integer vector, we utilize the interpolation technique to gather corresponding vectors from the input embedding matrix by discrete indexes. Experiments on Long Range Arena (LRA), a systematic and unified benchmark with different tasks, show that our model has achieved further improvement in performance compared with other sparse-based Transformers.
%R 10.18653/v1/2023.findings-acl.546
%U https://aclanthology.org/2023.findings-acl.546
%U https://doi.org/10.18653/v1/2023.findings-acl.546
%P 8602-8610
Markdown (Informal)
[Adaptive Attention for Sparse-based Long-sequence Transformer](https://aclanthology.org/2023.findings-acl.546) (Zhang et al., Findings 2023)
ACL