@inproceedings{pietruszka-etal-2022-sparsifying,
title = "Sparsifying Transformer Models with Trainable Representation Pooling",
author = "Pietruszka, Micha{\l} and
Borchmann, {\L}ukasz and
Garncarek, {\L}ukasz",
editor = "Muresan, Smaranda and
Nakov, Preslav and
Villavicencio, Aline",
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.acl-long.590",
doi = "10.18653/v1/2022.acl-long.590",
pages = "8616--8633",
abstract = "We propose a novel method to sparsify attention in the Transformer model by learning to select the most-informative token representations during the training process, thus focusing on the task-specific parts of an input. A reduction of quadratic time and memory complexity to sublinear was achieved due to a robust trainable top-$k$ operator.Our experiments on a challenging long document summarization task show that even our simple baseline performs comparably to the current SOTA, and with trainable pooling we can retain its top quality, while being $1.8\times$ faster during training, $4.5\times$ faster during inference, and up to $13\times$ more computationally efficient in the decoder.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pietruszka-etal-2022-sparsifying">
<titleInfo>
<title>Sparsifying Transformer Models with Trainable Representation Pooling</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michał</namePart>
<namePart type="family">Pietruszka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Łukasz</namePart>
<namePart type="family">Borchmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Łukasz</namePart>
<namePart type="family">Garncarek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aline</namePart>
<namePart type="family">Villavicencio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We propose a novel method to sparsify attention in the Transformer model by learning to select the most-informative token representations during the training process, thus focusing on the task-specific parts of an input. A reduction of quadratic time and memory complexity to sublinear was achieved due to a robust trainable top-k operator. Our experiments on a challenging long document summarization task show that even our simple baseline performs comparably to the current SOTA, and with trainable pooling we can retain its top quality, while being 1.8× faster during training, 4.5× faster during inference, and up to 13× more computationally efficient in the decoder.</abstract>
<identifier type="citekey">pietruszka-etal-2022-sparsifying</identifier>
<identifier type="doi">10.18653/v1/2022.acl-long.590</identifier>
<location>
<url>https://aclanthology.org/2022.acl-long.590</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>8616</start>
<end>8633</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Sparsifying Transformer Models with Trainable Representation Pooling
%A Pietruszka, Michał
%A Borchmann, Łukasz
%A Garncarek, Łukasz
%Y Muresan, Smaranda
%Y Nakov, Preslav
%Y Villavicencio, Aline
%S Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F pietruszka-etal-2022-sparsifying
%X We propose a novel method to sparsify attention in the Transformer model by learning to select the most-informative token representations during the training process, thus focusing on the task-specific parts of an input. A reduction of quadratic time and memory complexity to sublinear was achieved due to a robust trainable top-k operator. Our experiments on a challenging long document summarization task show that even our simple baseline performs comparably to the current SOTA, and with trainable pooling we can retain its top quality, while being 1.8× faster during training, 4.5× faster during inference, and up to 13× more computationally efficient in the decoder.
%R 10.18653/v1/2022.acl-long.590
%U https://aclanthology.org/2022.acl-long.590
%U https://doi.org/10.18653/v1/2022.acl-long.590
%P 8616-8633
Markdown (Informal)
[Sparsifying Transformer Models with Trainable Representation Pooling](https://aclanthology.org/2022.acl-long.590) (Pietruszka et al., ACL 2022)
ACL
Michał Pietruszka, Łukasz Borchmann, and Łukasz Garncarek. 2022. Sparsifying Transformer Models with Trainable Representation Pooling. In Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 8616–8633, Dublin, Ireland. Association for Computational Linguistics.
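
Note: the abstract describes pooling the most-informative token representations with a trainable top-k operator. The following is a minimal, illustrative PyTorch sketch of a generic score-and-select pooling layer, not the authors' implementation; the class name TopKPooling and all dimensions/hyperparameters are assumptions made for this example.

# Illustrative sketch only: a simple trainable score-and-select pooling layer.
# This is NOT the paper's exact operator; names and sizes are assumed for the example.
import torch
import torch.nn as nn


class TopKPooling(nn.Module):
    """Keep the k highest-scoring token representations per sequence."""

    def __init__(self, hidden_dim: int, k: int):
        super().__init__()
        self.k = k
        self.scorer = nn.Linear(hidden_dim, 1)  # learns which tokens to keep

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # hidden_states: (batch, seq_len, hidden_dim)
        scores = self.scorer(hidden_states).squeeze(-1)       # (batch, seq_len)
        topk_scores, topk_idx = scores.topk(self.k, dim=-1)   # hard top-k selection
        idx = topk_idx.unsqueeze(-1).expand(-1, -1, hidden_states.size(-1))
        selected = hidden_states.gather(1, idx)               # (batch, k, hidden_dim)
        # Rescale by the sigmoid of the scores so the scorer receives gradients.
        return selected * torch.sigmoid(topk_scores).unsqueeze(-1)


if __name__ == "__main__":
    pooled = TopKPooling(hidden_dim=768, k=64)(torch.randn(2, 1024, 768))
    print(pooled.shape)  # torch.Size([2, 64, 768])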