@inproceedings{echterhoff-etal-2024-muscle,
title = "{MUSCLE}: A Model Update Strategy for Compatible {LLM} Evolution",
author = "Echterhoff, Jessica Maria and
Faghri, Fartash and
Vemulapalli, Raviteja and
Hu, Ting-Yao and
Li, Chun-Liang and
Tuzel, Oncel and
Pouransari, Hadi",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.430/",
doi = "10.18653/v1/2024.findings-emnlp.430",
pages = "7320--7332",
abstract = "Large Language Models (LLMs) are regularly updated to enhance performance, typically through changes in data or architecture. Within the update process, developers often prioritize improving overall performance metrics, paying less attention to maintaining compatibility with earlier model versions. Instance-level degradation (instance regression) of performance from one model version to the next can interfere with a user`s mental model of the capabilities of a particular language model. Users having to adapt their mental model with every update can lead to dissatisfaction, especially when the new model has degraded compared to a prior version for a known use case (model update regression).We find that when pretrained LLM base models are updated, fine-tuned user-facing downstream task adapters experience negative flips {--} previously correct instances are now predicted incorrectly. We observe model update regression between different model versions on a diverse set of tasks and models, even when the downstream task training procedures remain identical. We argue for the importance of maintaining model update compatibility during updates, and present evaluation metrics designed specifically for generative tasks, while also being applicable to discriminative tasks. We propose a training strategy to minimize the extent of instance regression in model updates, involving training of a compatibility adapter that can enhance task fine-tuned language models. We show negative flips reduce by up to 40{\%} e.g. when updating Llama 1 to Llama 2 with our proposed method."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="echterhoff-etal-2024-muscle">
<titleInfo>
<title>MUSCLE: A Model Update Strategy for Compatible LLM Evolution</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jessica</namePart>
<namePart type="given">Maria</namePart>
<namePart type="family">Echterhoff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fartash</namePart>
<namePart type="family">Faghri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raviteja</namePart>
<namePart type="family">Vemulapalli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ting-Yao</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chun-Liang</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oncel</namePart>
<namePart type="family">Tuzel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hadi</namePart>
<namePart type="family">Pouransari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large Language Models (LLMs) are regularly updated to enhance performance, typically through changes in data or architecture. Within the update process, developers often prioritize improving overall performance metrics, paying less attention to maintaining compatibility with earlier model versions. Instance-level degradation (instance regression) of performance from one model version to the next can interfere with a user‘s mental model of the capabilities of a particular language model. Users having to adapt their mental model with every update can lead to dissatisfaction, especially when the new model has degraded compared to a prior version for a known use case (model update regression).We find that when pretrained LLM base models are updated, fine-tuned user-facing downstream task adapters experience negative flips – previously correct instances are now predicted incorrectly. We observe model update regression between different model versions on a diverse set of tasks and models, even when the downstream task training procedures remain identical. We argue for the importance of maintaining model update compatibility during updates, and present evaluation metrics designed specifically for generative tasks, while also being applicable to discriminative tasks. We propose a training strategy to minimize the extent of instance regression in model updates, involving training of a compatibility adapter that can enhance task fine-tuned language models. We show negative flips reduce by up to 40% e.g. when updating Llama 1 to Llama 2 with our proposed method.</abstract>
<identifier type="citekey">echterhoff-etal-2024-muscle</identifier>
<identifier type="doi">10.18653/v1/2024.findings-emnlp.430</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.430/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>7320</start>
<end>7332</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MUSCLE: A Model Update Strategy for Compatible LLM Evolution
%A Echterhoff, Jessica Maria
%A Faghri, Fartash
%A Vemulapalli, Raviteja
%A Hu, Ting-Yao
%A Li, Chun-Liang
%A Tuzel, Oncel
%A Pouransari, Hadi
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F echterhoff-etal-2024-muscle
%X Large Language Models (LLMs) are regularly updated to enhance performance, typically through changes in data or architecture. Within the update process, developers often prioritize improving overall performance metrics, paying less attention to maintaining compatibility with earlier model versions. Instance-level degradation (instance regression) of performance from one model version to the next can interfere with a user‘s mental model of the capabilities of a particular language model. Users having to adapt their mental model with every update can lead to dissatisfaction, especially when the new model has degraded compared to a prior version for a known use case (model update regression).We find that when pretrained LLM base models are updated, fine-tuned user-facing downstream task adapters experience negative flips – previously correct instances are now predicted incorrectly. We observe model update regression between different model versions on a diverse set of tasks and models, even when the downstream task training procedures remain identical. We argue for the importance of maintaining model update compatibility during updates, and present evaluation metrics designed specifically for generative tasks, while also being applicable to discriminative tasks. We propose a training strategy to minimize the extent of instance regression in model updates, involving training of a compatibility adapter that can enhance task fine-tuned language models. We show negative flips reduce by up to 40% e.g. when updating Llama 1 to Llama 2 with our proposed method.
%R 10.18653/v1/2024.findings-emnlp.430
%U https://aclanthology.org/2024.findings-emnlp.430/
%U https://doi.org/10.18653/v1/2024.findings-emnlp.430
%P 7320-7332
Markdown (Informal)
[MUSCLE: A Model Update Strategy for Compatible LLM Evolution](https://aclanthology.org/2024.findings-emnlp.430/) (Echterhoff et al., Findings 2024)
ACL
- Jessica Maria Echterhoff, Fartash Faghri, Raviteja Vemulapalli, Ting-Yao Hu, Chun-Liang Li, Oncel Tuzel, and Hadi Pouransari. 2024. MUSCLE: A Model Update Strategy for Compatible LLM Evolution. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 7320–7332, Miami, Florida, USA. Association for Computational Linguistics.