@inproceedings{wu-etal-2024-weight,
title = "Weight-Inherited Distillation for Task-Agnostic {BERT} Compression",
author = "Wu, Taiqiang and
Hou, Cheng and
Lao, Shanshan and
Li, Jiayi and
Wong, Ngai and
Zhao, Zhe and
Yang, Yujiu",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.2/",
doi = "10.18653/v1/2024.findings-naacl.2",
pages = "13--28",
abstract = "Knowledge Distillation (KD) is a predominant approach for BERT compression.Previous KD-based methods focus on designing extra alignment losses for the student model to mimic the behavior of the teacher model.These methods transfer the knowledge in an indirect way.In this paper, we propose a novel Weight-Inherited Distillation (WID), which directly transfers knowledge from the teacher.WID does not require any additional alignment loss and trains a compact student by inheriting the weights, showing a new perspective of knowledge distillation.Specifically, we design the row compactors and column compactors as mappings and then compress the weights via structural re-parameterization.Experimental results on the GLUE and SQuAD benchmarks show that WID outperforms previous state-of-the-art KD-based baselines.Further analysis indicates that WID can also learn the attention patterns from the teacher model without any alignment loss on attention distributions.The code is available at https://github.com/wutaiqiang/WID-NAACL2024."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wu-etal-2024-weight">
<titleInfo>
<title>Weight-Inherited Distillation for Task-Agnostic BERT Compression</title>
</titleInfo>
<name type="personal">
<namePart type="given">Taiqiang</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cheng</namePart>
<namePart type="family">Hou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shanshan</namePart>
<namePart type="family">Lao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiayi</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ngai</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhe</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yujiu</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Knowledge Distillation (KD) is a predominant approach for BERT compression.Previous KD-based methods focus on designing extra alignment losses for the student model to mimic the behavior of the teacher model.These methods transfer the knowledge in an indirect way.In this paper, we propose a novel Weight-Inherited Distillation (WID), which directly transfers knowledge from the teacher.WID does not require any additional alignment loss and trains a compact student by inheriting the weights, showing a new perspective of knowledge distillation.Specifically, we design the row compactors and column compactors as mappings and then compress the weights via structural re-parameterization.Experimental results on the GLUE and SQuAD benchmarks show that WID outperforms previous state-of-the-art KD-based baselines.Further analysis indicates that WID can also learn the attention patterns from the teacher model without any alignment loss on attention distributions.The code is available at https://github.com/wutaiqiang/WID-NAACL2024.</abstract>
<identifier type="citekey">wu-etal-2024-weight</identifier>
<identifier type="doi">10.18653/v1/2024.findings-naacl.2</identifier>
<location>
<url>https://aclanthology.org/2024.findings-naacl.2/</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>13</start>
<end>28</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Weight-Inherited Distillation for Task-Agnostic BERT Compression
%A Wu, Taiqiang
%A Hou, Cheng
%A Lao, Shanshan
%A Li, Jiayi
%A Wong, Ngai
%A Zhao, Zhe
%A Yang, Yujiu
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F wu-etal-2024-weight
%X Knowledge Distillation (KD) is a predominant approach for BERT compression.Previous KD-based methods focus on designing extra alignment losses for the student model to mimic the behavior of the teacher model.These methods transfer the knowledge in an indirect way.In this paper, we propose a novel Weight-Inherited Distillation (WID), which directly transfers knowledge from the teacher.WID does not require any additional alignment loss and trains a compact student by inheriting the weights, showing a new perspective of knowledge distillation.Specifically, we design the row compactors and column compactors as mappings and then compress the weights via structural re-parameterization.Experimental results on the GLUE and SQuAD benchmarks show that WID outperforms previous state-of-the-art KD-based baselines.Further analysis indicates that WID can also learn the attention patterns from the teacher model without any alignment loss on attention distributions.The code is available at https://github.com/wutaiqiang/WID-NAACL2024.
%R 10.18653/v1/2024.findings-naacl.2
%U https://aclanthology.org/2024.findings-naacl.2/
%U https://doi.org/10.18653/v1/2024.findings-naacl.2
%P 13-28
Markdown (Informal)
[Weight-Inherited Distillation for Task-Agnostic BERT Compression](https://aclanthology.org/2024.findings-naacl.2/) (Wu et al., Findings 2024)
ACL