@inproceedings{saad-falcon-etal-2023-embedding,
    title = "Embedding Recycling for Language Models",
    author = "Saad-Falcon, Jon and
      Singh, Amanpreet and
      Soldaini, Luca and
      D{'}Arcy, Mike and
      Cohan, Arman and
      Downey, Doug",
    editor = "Vlachos, Andreas and
      Augenstein, Isabelle",
    booktitle = "Findings of the Association for Computational Linguistics: EACL 2023",
    month = may,
    year = "2023",
    address = "Dubrovnik, Croatia",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.findings-eacl.145",
    doi = "10.18653/v1/2023.findings-eacl.145",
    pages = "1933--1953",
    abstract = "Real-world applications of neural language models often involve running many different models over the same corpus. The high computational cost of these runs has led to interest in techniques that can reuse the contextualized embeddings produced in previous runs to speed training and inference of future ones. We refer to this approach as embedding recycling (ER). While multiple ER techniques have been proposed, their practical effectiveness is still unknown because existing evaluations consider very few models and do not adequately account for overhead costs. We perform an extensive evaluation of ER across eight different models (17 to 900 million parameters) and fourteen tasks in English. We show how a simple ER technique that caches activations from an intermediate layer of a pretrained model, and learns task-specific adapters on the later layers, is broadly effective. For the best-performing baseline in our experiments (DeBERTa-v2 XL), adding a precomputed cache results in a 90{\%} speedup during training and 87-91{\%} speedup for inference, with negligible impact on accuracy. Our analysis reveals important areas of future work.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="saad-falcon-etal-2023-embedding">
    <titleInfo>
      <title>Embedding Recycling for Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Jon</namePart>
      <namePart type="family">Saad-Falcon</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Amanpreet</namePart>
      <namePart type="family">Singh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Luca</namePart>
      <namePart type="family">Soldaini</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mike</namePart>
      <namePart type="family">D’Arcy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Arman</namePart>
      <namePart type="family">Cohan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Doug</namePart>
      <namePart type="family">Downey</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EACL 2023</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Andreas</namePart>
        <namePart type="family">Vlachos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Isabelle</namePart>
        <namePart type="family">Augenstein</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dubrovnik, Croatia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Real-world applications of neural language models often involve running many different models over the same corpus. The high computational cost of these runs has led to interest in techniques that can reuse the contextualized embeddings produced in previous runs to speed training and inference of future ones. We refer to this approach as embedding recycling (ER). While multiple ER techniques have been proposed, their practical effectiveness is still unknown because existing evaluations consider very few models and do not adequately account for overhead costs. We perform an extensive evaluation of ER across eight different models (17 to 900 million parameters) and fourteen tasks in English. We show how a simple ER technique that caches activations from an intermediate layer of a pretrained model, and learns task-specific adapters on the later layers, is broadly effective. For the best-performing baseline in our experiments (DeBERTa-v2 XL), adding a precomputed cache results in a 90% speedup during training and 87-91% speedup for inference, with negligible impact on accuracy. Our analysis reveals important areas of future work.</abstract>
    <identifier type="citekey">saad-falcon-etal-2023-embedding</identifier>
    <identifier type="doi">10.18653/v1/2023.findings-eacl.145</identifier>
    <location>
      <url>https://aclanthology.org/2023.findings-eacl.145</url>
    </location>
    <part>
      <date>2023-05</date>
      <extent unit="page">
        <start>1933</start>
        <end>1953</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Embedding Recycling for Language Models
%A Saad-Falcon, Jon
%A Singh, Amanpreet
%A Soldaini, Luca
%A D’Arcy, Mike
%A Cohan, Arman
%A Downey, Doug
%Y Vlachos, Andreas
%Y Augenstein, Isabelle
%S Findings of the Association for Computational Linguistics: EACL 2023
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F saad-falcon-etal-2023-embedding
%X Real-world applications of neural language models often involve running many different models over the same corpus. The high computational cost of these runs has led to interest in techniques that can reuse the contextualized embeddings produced in previous runs to speed training and inference of future ones. We refer to this approach as embedding recycling (ER). While multiple ER techniques have been proposed, their practical effectiveness is still unknown because existing evaluations consider very few models and do not adequately account for overhead costs. We perform an extensive evaluation of ER across eight different models (17 to 900 million parameters) and fourteen tasks in English. We show how a simple ER technique that caches activations from an intermediate layer of a pretrained model, and learns task-specific adapters on the later layers, is broadly effective. For the best-performing baseline in our experiments (DeBERTa-v2 XL), adding a precomputed cache results in a 90% speedup during training and 87-91% speedup for inference, with negligible impact on accuracy. Our analysis reveals important areas of future work.
%R 10.18653/v1/2023.findings-eacl.145
%U https://aclanthology.org/2023.findings-eacl.145
%U https://doi.org/10.18653/v1/2023.findings-eacl.145
%P 1933-1953
Markdown (Informal)
[Embedding Recycling for Language Models](https://aclanthology.org/2023.findings-eacl.145) (Saad-Falcon et al., Findings 2023)
ACL
Jon Saad-Falcon, Amanpreet Singh, Luca Soldaini, Mike D’Arcy, Arman Cohan, and Doug Downey. 2023. Embedding Recycling for Language Models. In Findings of the Association for Computational Linguistics: EACL 2023, pages 1933–1953, Dubrovnik, Croatia. Association for Computational Linguistics.
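
Read alongside the abstract, the following is a minimal sketch of the embedding-recycling recipe it describes: run the frozen pretrained encoder's lower layers once over the corpus, cache those intermediate activations, and train only a small task-specific adapter (plus a classification head) on top of the cache. This is an illustration under stated assumptions, not the paper's implementation; the PyTorch encoder, layer split, adapter placement, and all hyperparameters below are stand-ins.

```python
# Minimal sketch of the embedding-recycling (ER) idea from the abstract:
# freeze a pretrained encoder, cache the activations of an intermediate
# layer once, then train only a small adapter and task head on the cache.
# All module names, sizes, and the adapter placement are illustrative
# assumptions, not the configuration evaluated in the paper.
import torch
import torch.nn as nn

torch.manual_seed(0)

D_MODEL, N_LAYERS, CACHE_LAYER, N_CLASSES = 64, 6, 3, 2

# Stand-in for a pretrained transformer encoder (frozen throughout).
layers = nn.ModuleList(
    [nn.TransformerEncoderLayer(d_model=D_MODEL, nhead=4,
                                dim_feedforward=128, batch_first=True)
     for _ in range(N_LAYERS)]
)
for p in layers.parameters():
    p.requires_grad_(False)
layers.eval()  # frozen encoder: no dropout, no weight updates

lower, upper = layers[:CACHE_LAYER], layers[CACHE_LAYER:]


def build_cache(token_embeddings: torch.Tensor) -> torch.Tensor:
    """Run the frozen lower layers once and store the result (the 'cache')."""
    h = token_embeddings
    with torch.no_grad():
        for layer in lower:
            h = layer(h)
    return h


class Adapter(nn.Module):
    """Small bottleneck module trained per task on top of cached activations."""

    def __init__(self, d_model: int, bottleneck: int = 16):
        super().__init__()
        self.down = nn.Linear(d_model, bottleneck)
        self.up = nn.Linear(bottleneck, d_model)

    def forward(self, h):
        return h + self.up(torch.relu(self.down(h)))  # residual bottleneck


adapter = Adapter(D_MODEL)
head = nn.Linear(D_MODEL, N_CLASSES)
optimizer = torch.optim.Adam(
    list(adapter.parameters()) + list(head.parameters()), lr=1e-3
)

# Toy "corpus": random token embeddings and labels, cached once up front.
x = torch.randn(32, 10, D_MODEL)            # (batch, seq_len, d_model)
y = torch.randint(0, N_CLASSES, (32,))
cache = build_cache(x)                      # reused across epochs and tasks

for step in range(5):
    h = adapter(cache)
    for layer in upper:                     # frozen upper layers; gradients
        h = layer(h)                        # still flow back to the adapter
    logits = head(h.mean(dim=1))            # mean-pool tokens, then classify
    loss = nn.functional.cross_entropy(logits, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"step {step}: loss {loss.item():.3f}")
```

Because the frozen lower layers never need to be re-run, the cache is computed once and can be reused across training epochs and shared by different task-specific adapters; that avoided recomputation is where the training and inference speedups reported in the abstract come from.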