@inproceedings{wei-etal-2024-structured,
title = "Structured Optimal Brain Pruning for Large Language Models",
author = "Wei, Jiateng and
Lu, Quan and
Jiang, Ning and
Li, Siqi and
Xiang, Jingyang and
Chen, Jun and
Liu, Yong",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.775",
doi = "10.18653/v1/2024.emnlp-main.775",
pages = "13991--14007",
abstract = "The massive parameters and computational demands hinder the widespread application of Large Language Models (LLMs). Network pruning provides a practical solution to this problem. However, existing pruning works for LLMs mainly focus on unstructured pruning or necessitate post-pruning fine-tuning. The former relies on special hardware to accelerate computation, while the latter may need substantial computational resources. In this paper, we introduce a retraining-free structured pruning method called SoBP (Structured Optimal Brain Pruning). It leverages global first-order information to select pruning structures, then refines them with a local greedy approach, and finally adopts module-wise reconstruction to mitigate information loss. We assess the effectiveness of SoBP across 14 models from 3 LLM families on 8 distinct datasets. Experimental results demonstrate that SoBP outperforms current state-of-the-art methods.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wei-etal-2024-structured">
<titleInfo>
<title>Structured Optimal Brain Pruning for Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiateng</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Quan</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ning</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siqi</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingyang</namePart>
<namePart type="family">Xiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yong</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The massive parameters and computational demands hinder the widespread application of Large Language Models (LLMs). Network pruning provides a practical solution to this problem. However, existing pruning works for LLMs mainly focus on unstructured pruning or necessitate post-pruning fine-tuning. The former relies on special hardware to accelerate computation, while the latter may need substantial computational resources. In this paper, we introduce a retraining-free structured pruning method called SoBP (Structured Optimal Brain Pruning). It leverages global first-order information to select pruning structures, then refines them with a local greedy approach, and finally adopts module-wise reconstruction to mitigate information loss. We assess the effectiveness of SoBP across 14 models from 3 LLM families on 8 distinct datasets. Experimental results demonstrate that SoBP outperforms current state-of-the-art methods.</abstract>
<identifier type="citekey">wei-etal-2024-structured</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.775</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.775</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>13991</start>
<end>14007</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Structured Optimal Brain Pruning for Large Language Models
%A Wei, Jiateng
%A Lu, Quan
%A Jiang, Ning
%A Li, Siqi
%A Xiang, Jingyang
%A Chen, Jun
%A Liu, Yong
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F wei-etal-2024-structured
%X The massive parameters and computational demands hinder the widespread application of Large Language Models (LLMs). Network pruning provides a practical solution to this problem. However, existing pruning works for LLMs mainly focus on unstructured pruning or necessitate post-pruning fine-tuning. The former relies on special hardware to accelerate computation, while the latter may need substantial computational resources. In this paper, we introduce a retraining-free structured pruning method called SoBP (Structured Optimal Brain Pruning). It leverages global first-order information to select pruning structures, then refines them with a local greedy approach, and finally adopts module-wise reconstruction to mitigate information loss. We assess the effectiveness of SoBP across 14 models from 3 LLM families on 8 distinct datasets. Experimental results demonstrate that SoBP outperforms current state-of-the-art methods.
%R 10.18653/v1/2024.emnlp-main.775
%U https://aclanthology.org/2024.emnlp-main.775
%U https://doi.org/10.18653/v1/2024.emnlp-main.775
%P 13991-14007
Markdown (Informal)
[Structured Optimal Brain Pruning for Large Language Models](https://aclanthology.org/2024.emnlp-main.775) (Wei et al., EMNLP 2024)
ACL
- Jiateng Wei, Quan Lu, Ning Jiang, Siqi Li, Jingyang Xiang, Jun Chen, and Yong Liu. 2024. Structured Optimal Brain Pruning for Large Language Models. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 13991–14007, Miami, Florida, USA. Association for Computational Linguistics.