@inproceedings{lin-etal-2024-navigating-noisy,
    title = "Navigating Noisy Feedback: Enhancing Reinforcement Learning with Error-Prone Language Models",
    author = "Lin, Muhan and
      Shi, Shuyang and
      Guo, Yue and
      Chalaki, Behdad and
      Tadiparthi, Vaishnav and
      Moradi Pari, Ehsan and
      Stepputtis, Simon and
      Campbell, Joseph and
      Sycara, Katia P.",
    editor = "Al-Onaizan, Yaser and
      Bansal, Mohit and
      Chen, Yun-Nung",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.findings-emnlp.939/",
    doi = "10.18653/v1/2024.findings-emnlp.939",
    pages = "16002--16014",
abstract = "The correct specification of reward models is a well-known challenge in reinforcement learning.Hand-crafted reward functions often lead to inefficient or suboptimal policies and may not be aligned with user values.Reinforcement learning from human feedback is a successful technique that can mitigate such issues, however, the collection of human feedback can be laborious.Recent works have solicited feedback from pre-trained large language models rather than humans to reduce or eliminate human effort, however, these approaches yield poor performance in the presence of hallucination and other errors.This paper studies the advantages and limitations of reinforcement learning from large language model feedback and proposes a simple yet effective method for soliciting and applying feedback as a potential-based shaping function.We theoretically show that inconsistent rankings {--} which approximate ranking errors {--} lead to uninformative rewards with our approach. Our method empirically improves convergence speed and policy returns over commonly used baselines even with significant ranking errors, and eliminates the need for complex post-processing of reward functions."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lin-etal-2024-navigating-noisy">
    <titleInfo>
        <title>Navigating Noisy Feedback: Enhancing Reinforcement Learning with Error-Prone Language Models</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Muhan</namePart>
        <namePart type="family">Lin</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Shuyang</namePart>
        <namePart type="family">Shi</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Yue</namePart>
        <namePart type="family">Guo</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Behdad</namePart>
        <namePart type="family">Chalaki</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Vaishnav</namePart>
        <namePart type="family">Tadiparthi</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Ehsan</namePart>
        <namePart type="family">Moradi Pari</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Simon</namePart>
        <namePart type="family">Stepputtis</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Joseph</namePart>
        <namePart type="family">Campbell</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Katia</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Sycara</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Yaser</namePart>
            <namePart type="family">Al-Onaizan</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Mohit</namePart>
            <namePart type="family">Bansal</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Yun-Nung</namePart>
            <namePart type="family">Chen</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Miami, Florida, USA</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The correct specification of reward models is a well-known challenge in reinforcement learning. Hand-crafted reward functions often lead to inefficient or suboptimal policies and may not be aligned with user values. Reinforcement learning from human feedback is a successful technique that can mitigate such issues; however, the collection of human feedback can be laborious. Recent works have solicited feedback from pre-trained large language models rather than humans to reduce or eliminate human effort; however, these approaches yield poor performance in the presence of hallucination and other errors. This paper studies the advantages and limitations of reinforcement learning from large language model feedback and proposes a simple yet effective method for soliciting and applying feedback as a potential-based shaping function. We theoretically show that inconsistent rankings – which approximate ranking errors – lead to uninformative rewards with our approach. Our method empirically improves convergence speed and policy returns over commonly used baselines even with significant ranking errors, and eliminates the need for complex post-processing of reward functions.</abstract>
<identifier type="citekey">lin-etal-2024-navigating-noisy</identifier>
<identifier type="doi">10.18653/v1/2024.findings-emnlp.939</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.939/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>16002</start>
<end>16014</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Navigating Noisy Feedback: Enhancing Reinforcement Learning with Error-Prone Language Models
%A Lin, Muhan
%A Shi, Shuyang
%A Guo, Yue
%A Chalaki, Behdad
%A Tadiparthi, Vaishnav
%A Moradi Pari, Ehsan
%A Stepputtis, Simon
%A Campbell, Joseph
%A Sycara, Katia P.
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F lin-etal-2024-navigating-noisy
%X The correct specification of reward models is a well-known challenge in reinforcement learning. Hand-crafted reward functions often lead to inefficient or suboptimal policies and may not be aligned with user values. Reinforcement learning from human feedback is a successful technique that can mitigate such issues; however, the collection of human feedback can be laborious. Recent works have solicited feedback from pre-trained large language models rather than humans to reduce or eliminate human effort; however, these approaches yield poor performance in the presence of hallucination and other errors. This paper studies the advantages and limitations of reinforcement learning from large language model feedback and proposes a simple yet effective method for soliciting and applying feedback as a potential-based shaping function. We theoretically show that inconsistent rankings – which approximate ranking errors – lead to uninformative rewards with our approach. Our method empirically improves convergence speed and policy returns over commonly used baselines even with significant ranking errors, and eliminates the need for complex post-processing of reward functions.
%R 10.18653/v1/2024.findings-emnlp.939
%U https://aclanthology.org/2024.findings-emnlp.939/
%U https://doi.org/10.18653/v1/2024.findings-emnlp.939
%P 16002-16014
Markdown (Informal)
[Navigating Noisy Feedback: Enhancing Reinforcement Learning with Error-Prone Language Models](https://aclanthology.org/2024.findings-emnlp.939/) (Lin et al., Findings 2024)
ACL
Muhan Lin, Shuyang Shi, Yue Guo, Behdad Chalaki, Vaishnav Tadiparthi, Ehsan Moradi Pari, Simon Stepputtis, Joseph Campbell, and Katia P. Sycara. 2024. Navigating Noisy Feedback: Enhancing Reinforcement Learning with Error-Prone Language Models. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 16002–16014, Miami, Florida, USA. Association for Computational Linguistics.
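
For context on the mechanism the abstract names, below is a minimal sketch of potential-based reward shaping driven by ranking feedback. It assumes the standard shaping form of Ng et al. (1999) and a hypothetical win-rate heuristic for converting noisy pairwise rankings into potentials; it is illustrative only, not the paper's implementation.

# Sketch only: potential-based shaping with a ranking-derived potential.
# `pairwise_prefs` and the win-rate heuristic are hypothetical stand-ins
# for an LLM-feedback pipeline, not the authors' method.

def potentials_from_rankings(pairwise_prefs, states):
    """Turn noisy pairwise preferences (better, worse) into a per-state
    potential via win rate. Contradictory pairs (a > b and b > a) cancel
    toward 0.5, flattening the potential: this mirrors the abstract's claim
    that inconsistent rankings lead to uninformative rewards."""
    wins = {s: 0 for s in states}
    seen = {s: 0 for s in states}
    for better, worse in pairwise_prefs:
        wins[better] += 1
        seen[better] += 1
        seen[worse] += 1
    return {s: wins[s] / seen[s] if seen[s] else 0.5 for s in states}

def shaped_reward(env_reward, phi, s, s_next, gamma=0.99):
    """r' = r + gamma * phi(s') - phi(s). Potential-based shaping preserves
    the optimal policy, so a noisy phi changes learning speed, not the
    optimum (Ng et al., 1999)."""
    return env_reward + gamma * phi[s_next] - phi[s]

With consistent feedback, e.g. potentials_from_rankings([("a", "b")], ["a", "b"]), the potential separates states (1.0 vs. 0.0); adding the contradictory pair ("b", "a") drives both potentials to 0.5, so the shaping term becomes near-constant and uninformative.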