@inproceedings{shen-etal-2024-learning,
title = "Learning to Decode Collaboratively with Multiple Language Models",
author = "Shen, Zejiang and
Lang, Hunter and
Wang, Bailin and
Kim, Yoon and
Sontag, David",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.acl-long.701",
doi = "10.18653/v1/2024.acl-long.701",
pages = "12974--12990",
abstract = "We propose a method to teach multiple large language models (LLM) to collaborate by interleaving their generations at the token level. We model the decision of which LLM generates the next token as a latent variable. By optimizing the marginal likelihood of a training set under our latent variable model, the base LLM automatically learns when to generate itself and when to call on one of the {``}assistant{''} language models to generate, all without direct supervision. Token-level collaboration during decoding allows for a fusion of each model{'}s expertise in a manner tailored to the specific task at hand. Our collaborative decoding is especially useful in cross-domain settings where a generalist base LLM learns to invoke domain expert models. On instruction-following, domain-specific QA, and reasoning tasks, we show that the performance of the joint system exceeds that of the individual models. Through qualitative analysis, we show models trained with our method exhibit several interesting collaboration patterns, e.g., template-filling, by visualizing the learned latent decisions.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shen-etal-2024-learning">
<titleInfo>
<title>Learning to Decode Collaboratively with Multiple Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zejiang</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hunter</namePart>
<namePart type="family">Lang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bailin</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yoon</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Sontag</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We propose a method to teach multiple large language models (LLM) to collaborate by interleaving their generations at the token level. We model the decision of which LLM generates the next token as a latent variable. By optimizing the marginal likelihood of a training set under our latent variable model, the base LLM automatically learns when to generate itself and when to call on one of the “assistant” language models to generate, all without direct supervision. Token-level collaboration during decoding allows for a fusion of each model’s expertise in a manner tailored to the specific task at hand. Our collaborative decoding is especially useful in cross-domain settings where a generalist base LLM learns to invoke domain expert models. On instruction-following, domain-specific QA, and reasoning tasks, we show that the performance of the joint system exceeds that of the individual models. Through qualitative analysis, we show models trained with our method exhibit several interesting collaboration patterns, e.g., template-filling, by visualizing the learned latent decisions.</abstract>
<identifier type="citekey">shen-etal-2024-learning</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.701</identifier>
<location>
<url>https://aclanthology.org/2024.acl-long.701</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>12974</start>
<end>12990</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Learning to Decode Collaboratively with Multiple Language Models
%A Shen, Zejiang
%A Lang, Hunter
%A Wang, Bailin
%A Kim, Yoon
%A Sontag, David
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F shen-etal-2024-learning
%X We propose a method to teach multiple large language models (LLM) to collaborate by interleaving their generations at the token level. We model the decision of which LLM generates the next token as a latent variable. By optimizing the marginal likelihood of a training set under our latent variable model, the base LLM automatically learns when to generate itself and when to call on one of the “assistant” language models to generate, all without direct supervision. Token-level collaboration during decoding allows for a fusion of each model’s expertise in a manner tailored to the specific task at hand. Our collaborative decoding is especially useful in cross-domain settings where a generalist base LLM learns to invoke domain expert models. On instruction-following, domain-specific QA, and reasoning tasks, we show that the performance of the joint system exceeds that of the individual models. Through qualitative analysis, we show models trained with our method exhibit several interesting collaboration patterns, e.g., template-filling, by visualizing the learned latent decisions.
%R 10.18653/v1/2024.acl-long.701
%U https://aclanthology.org/2024.acl-long.701
%U https://doi.org/10.18653/v1/2024.acl-long.701
%P 12974-12990
Markdown (Informal)
[Learning to Decode Collaboratively with Multiple Language Models](https://aclanthology.org/2024.acl-long.701) (Shen et al., ACL 2024)
ACL
- Zejiang Shen, Hunter Lang, Bailin Wang, Yoon Kim, and David Sontag. 2024. Learning to Decode Collaboratively with Multiple Language Models. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 12974–12990, Bangkok, Thailand. Association for Computational Linguistics.