@inproceedings{dongge-yang-2023-mscffn,
title = "{MSCFFN}: A New {FFN} with Multi-Space Cross to Accelerate Transformer",
author = "Dongge, Tang and
Yang, Qing",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.1017",
doi = "10.18653/v1/2023.findings-emnlp.1017",
pages = "15234--15239",
abstract = "Transformer models have achieved impressive success in various natural language processing tasks. But it is also limited used in some areas and the heavy computation complexity is one of the main limitations. Many model structures have been proposed to reduce the computation complexity and some are really effective. The previous research can be divided into two categories. One is to use more effective training and inference strategies and the other is focused on how to replace the standard self-attention mechanism with linear attention method. Differently, we revisit the design in Transformer and find that the feed forward network (FFN) is also computationally expensive, especially when the hidden dimension is large. In this paper, we propose a new FFN structure, named MSCFFN, which splits the large matrix space to several small space to reduce the computation complexity and uses the Multi-Space Cross method to ensure the accurate result. To the best of our knowledge, this is the first time to redesign FFN to accelerate Transformers. We experimentally validate the effectiveness of the proposed method on the Long-Range Arena benchmark. And the results show MSCFFN can achieve a faster speed with a similar or even better accuracy.",
}
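
The abstract above describes MSCFFN only at a high level: replace the single wide FFN projection with several smaller parallel subspaces and "cross" them before projecting back to the model dimension. The PyTorch sketch below is a minimal, hypothetical reading of that idea, not the authors' reference implementation; the module name `MultiSpaceFFN`, the input chunking, the elementwise product used as the cross operation, and the default sizes are all assumptions made for illustration.

```python
import torch
import torch.nn as nn


class MultiSpaceFFN(nn.Module):
    """Hypothetical sketch of a multi-space FFN in the spirit of MSCFFN.

    The input is split into `n_spaces` chunks, each chunk is expanded by a
    small linear layer, adjacent subspaces are "crossed", and the crossed
    result is projected back to d_model. This illustrates the idea described
    in the abstract, not the paper's actual architecture.
    """

    def __init__(self, d_model: int = 512, d_ff: int = 2048, n_spaces: int = 4):
        super().__init__()
        assert d_model % n_spaces == 0 and d_ff % n_spaces == 0 and n_spaces % 2 == 0
        self.n_spaces = n_spaces
        d_in = d_model // n_spaces   # width of each input chunk
        d_hidden = d_ff // n_spaces  # width of each hidden subspace
        # n_spaces small up-projections replace the single d_model -> d_ff matrix.
        self.up = nn.ModuleList(nn.Linear(d_in, d_hidden) for _ in range(n_spaces))
        self.act = nn.ReLU()
        # Crossing pairs of subspaces halves the width fed to the down-projection.
        self.down = nn.Linear(d_hidden * (n_spaces // 2), d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        chunks = x.chunk(self.n_spaces, dim=-1)
        hidden = [self.act(w(c)) for w, c in zip(self.up, chunks)]
        # "Cross" adjacent subspaces with an elementwise product (an assumption;
        # the paper's exact cross operation may differ).
        crossed = [hidden[2 * i] * hidden[2 * i + 1] for i in range(self.n_spaces // 2)]
        return self.down(torch.cat(crossed, dim=-1))


if __name__ == "__main__":
    ffn = MultiSpaceFFN()
    out = ffn(torch.randn(2, 16, 512))  # (batch, sequence, d_model)
    print(out.shape)  # torch.Size([2, 16, 512])
```

Under this reading, with d_model = 512, d_ff = 2048, and n_spaces = 4, the up-projections cost 4 x 128 x 512 = 262,144 multiply-accumulates per token and the down-projection 1024 x 512 = 524,288, versus roughly 2 x 512 x 2048 = 2,097,152 for a standard FFN; whether the published MSCFFN matches this bookkeeping depends on details the abstract does not give.
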
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dongge-yang-2023-mscffn">
<titleInfo>
<title>MSCFFN: A New FFN with Multi-Space Cross to Accelerate Transformer</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tang</namePart>
<namePart type="family">Dongge</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qing</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Transformer models have achieved impressive success in various natural language processing tasks, but their use remains limited in some areas, and their heavy computational complexity is one of the main limitations. Many model structures have been proposed to reduce this complexity, and some are effective. Previous research falls into two categories: one uses more effective training and inference strategies, and the other focuses on replacing the standard self-attention mechanism with linear attention methods. In contrast, we revisit the design of the Transformer and find that the feed-forward network (FFN) is also computationally expensive, especially when the hidden dimension is large. In this paper, we propose a new FFN structure, named MSCFFN, which splits the large matrix space into several smaller spaces to reduce the computational complexity and uses the Multi-Space Cross method to maintain accuracy. To the best of our knowledge, this is the first work to redesign the FFN to accelerate Transformers. We experimentally validate the effectiveness of the proposed method on the Long-Range Arena benchmark, and the results show that MSCFFN achieves faster speed with similar or even better accuracy.</abstract>
<identifier type="citekey">dongge-yang-2023-mscffn</identifier>
<identifier type="doi">10.18653/v1/2023.findings-emnlp.1017</identifier>
<location>
<url>https://aclanthology.org/2023.findings-emnlp.1017</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>15234</start>
<end>15239</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MSCFFN: A New FFN with Multi-Space Cross to Accelerate Transformer
%A Dongge, Tang
%A Yang, Qing
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F dongge-yang-2023-mscffn
%X Transformer models have achieved impressive success in various natural language processing tasks, but their use remains limited in some areas, and their heavy computational complexity is one of the main limitations. Many model structures have been proposed to reduce this complexity, and some are effective. Previous research falls into two categories: one uses more effective training and inference strategies, and the other focuses on replacing the standard self-attention mechanism with linear attention methods. In contrast, we revisit the design of the Transformer and find that the feed-forward network (FFN) is also computationally expensive, especially when the hidden dimension is large. In this paper, we propose a new FFN structure, named MSCFFN, which splits the large matrix space into several smaller spaces to reduce the computational complexity and uses the Multi-Space Cross method to maintain accuracy. To the best of our knowledge, this is the first work to redesign the FFN to accelerate Transformers. We experimentally validate the effectiveness of the proposed method on the Long-Range Arena benchmark, and the results show that MSCFFN achieves faster speed with similar or even better accuracy.
%R 10.18653/v1/2023.findings-emnlp.1017
%U https://aclanthology.org/2023.findings-emnlp.1017
%U https://doi.org/10.18653/v1/2023.findings-emnlp.1017
%P 15234-15239
Markdown (Informal)
[MSCFFN: A New FFN with Multi-Space Cross to Accelerate Transformer](https://aclanthology.org/2023.findings-emnlp.1017) (Dongge & Yang, Findings 2023)
ACL
Tang Dongge and Qing Yang. 2023. MSCFFN: A New FFN with Multi-Space Cross to Accelerate Transformer. In Findings of the Association for Computational Linguistics: EMNLP 2023, pages 15234–15239, Singapore. Association for Computational Linguistics.