@inproceedings{pang-etal-2024-arm,
title = "{ARM}: Alignment with Residual Energy-Based Model",
author = "Pang, Bo and
Xiong, Caiming and
Zhou, Yingbo",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-long.455/",
doi = "10.18653/v1/2024.naacl-long.455",
pages = "8225--8236",
abstract = "While large language models (LLMs) trained with large-scale unsupervised learning acquire a wide variety of world knowledge and skills, its behavior does not necessarily align with human preferences. RLHF methods achieve successes in aligning LLM responses with human preferences and improving the controllability of LLM behavior with human instruction. However, RLHF methods are considerably complicated to implement, computationally expensive to train, and notoriously tricky to tune. In this work, we propose Alignment with Residual Energy-Based Model (ARM), as a simple and flexible alternative to RLHF methods. Our method is driven by an observation that we can learn an aligned policy by minimizing a forward Kullback{--}Leibler (KL) divergence from a target policy (in the form of a residual energy-based model) to a parameteric policy (LLM), instead of a reverse KL as in RLHF methods. With samples from the energy-based target policy, we can leverage the power of DPO (or other offline methods) to learn an aligned policy efficiently. ARM is simple to implement and applicable in various data settings. Our extensive experiments demonstrate its strong performance across multiple datasets, compared to strong baselines like PPO, DPO."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pang-etal-2024-arm">
<titleInfo>
<title>ARM: Alignment with Residual Energy-Based Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Pang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Caiming</namePart>
<namePart type="family">Xiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yingbo</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>While large language models (LLMs) trained with large-scale unsupervised learning acquire a wide variety of world knowledge and skills, its behavior does not necessarily align with human preferences. RLHF methods achieve successes in aligning LLM responses with human preferences and improving the controllability of LLM behavior with human instruction. However, RLHF methods are considerably complicated to implement, computationally expensive to train, and notoriously tricky to tune. In this work, we propose Alignment with Residual Energy-Based Model (ARM), as a simple and flexible alternative to RLHF methods. Our method is driven by an observation that we can learn an aligned policy by minimizing a forward Kullback–Leibler (KL) divergence from a target policy (in the form of a residual energy-based model) to a parameteric policy (LLM), instead of a reverse KL as in RLHF methods. With samples from the energy-based target policy, we can leverage the power of DPO (or other offline methods) to learn an aligned policy efficiently. ARM is simple to implement and applicable in various data settings. Our extensive experiments demonstrate its strong performance across multiple datasets, compared to strong baselines like PPO, DPO.</abstract>
<identifier type="citekey">pang-etal-2024-arm</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-long.455</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-long.455/</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>8225</start>
<end>8236</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ARM: Alignment with Residual Energy-Based Model
%A Pang, Bo
%A Xiong, Caiming
%A Zhou, Yingbo
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F pang-etal-2024-arm
%X While large language models (LLMs) trained with large-scale unsupervised learning acquire a wide variety of world knowledge and skills, its behavior does not necessarily align with human preferences. RLHF methods achieve successes in aligning LLM responses with human preferences and improving the controllability of LLM behavior with human instruction. However, RLHF methods are considerably complicated to implement, computationally expensive to train, and notoriously tricky to tune. In this work, we propose Alignment with Residual Energy-Based Model (ARM), as a simple and flexible alternative to RLHF methods. Our method is driven by an observation that we can learn an aligned policy by minimizing a forward Kullback–Leibler (KL) divergence from a target policy (in the form of a residual energy-based model) to a parameteric policy (LLM), instead of a reverse KL as in RLHF methods. With samples from the energy-based target policy, we can leverage the power of DPO (or other offline methods) to learn an aligned policy efficiently. ARM is simple to implement and applicable in various data settings. Our extensive experiments demonstrate its strong performance across multiple datasets, compared to strong baselines like PPO, DPO.
%R 10.18653/v1/2024.naacl-long.455
%U https://aclanthology.org/2024.naacl-long.455/
%U https://doi.org/10.18653/v1/2024.naacl-long.455
%P 8225-8236
Markdown (Informal)
[ARM: Alignment with Residual Energy-Based Model](https://aclanthology.org/2024.naacl-long.455/) (Pang et al., NAACL 2024)
ACL
- Bo Pang, Caiming Xiong, and Yingbo Zhou. 2024. ARM: Alignment with Residual Energy-Based Model. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 8225–8236, Mexico City, Mexico. Association for Computational Linguistics.