BibTeX
@inproceedings{chen-etal-2024-reconfidencing,
    title = "Reconfidencing {LLM}s from the Grouping Loss Perspective",
    author = {Chen, Lihu and
      Perez-Lebel, Alexandre and
      Suchanek, Fabian M. and
      Varoquaux, Ga{\"e}l},
    editor = "Al-Onaizan, Yaser and
      Bansal, Mohit and
      Chen, Yun-Nung",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.findings-emnlp.85/",
    doi = "10.18653/v1/2024.findings-emnlp.85",
    pages = "1567--1581",
    abstract = "Large Language Models (LLMs), such as GPT and LLaMA, are susceptible to generating hallucinated answers in a confident tone. While previous efforts to elicit and calibrate confidence scores have shown some success, they often overlook biases towards certain groups, such as specific nationalities. Existing calibration methods typically focus on average performance, failing to address this disparity. In our study, we demonstrate that the concept of grouping loss is an effective metric for understanding and correcting the heterogeneity in confidence levels. We introduce a novel evaluation dataset, derived from a knowledge base, specifically designed to assess the confidence scores of LLM responses across different groups. Our experimental results highlight significant variations in confidence, which are accurately captured by grouping loss. To tackle this issue, we propose a new method to calibrate the confidence scores of LLMs by considering different groups, a process we term \textit{reconfidencing}. Our findings indicate that this approach effectively mitigates biases against minority groups, contributing to the development of fairer LLMs."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="chen-etal-2024-reconfidencing">
    <titleInfo>
      <title>Reconfidencing LLMs from the Grouping Loss Perspective</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Lihu</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alexandre</namePart>
      <namePart type="family">Perez-Lebel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fabian</namePart>
      <namePart type="given">M</namePart>
      <namePart type="family">Suchanek</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gaël</namePart>
      <namePart type="family">Varoquaux</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Large Language Models (LLMs), such as GPT and LLaMA, are susceptible to generating hallucinated answers in a confident tone. While previous efforts to elicit and calibrate confidence scores have shown some success, they often overlook biases towards certain groups, such as specific nationalities. Existing calibration methods typically focus on average performance, failing to address this disparity. In our study, we demonstrate that the concept of grouping loss is an effective metric for understanding and correcting the heterogeneity in confidence levels. We introduce a novel evaluation dataset, derived from a knowledge base, specifically designed to assess the confidence scores of LLM responses across different groups. Our experimental results highlight significant variations in confidence, which are accurately captured by grouping loss. To tackle this issue, we propose a new method to calibrate the confidence scores of LLMs by considering different groups, a process we term reconfidencing. Our findings indicate that this approach effectively mitigates biases against minority groups, contributing to the development of fairer LLMs.</abstract>
    <identifier type="citekey">chen-etal-2024-reconfidencing</identifier>
    <identifier type="doi">10.18653/v1/2024.findings-emnlp.85</identifier>
    <location>
      <url>https://aclanthology.org/2024.findings-emnlp.85/</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>1567</start>
        <end>1581</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Reconfidencing LLMs from the Grouping Loss Perspective
%A Chen, Lihu
%A Perez-Lebel, Alexandre
%A Suchanek, Fabian M.
%A Varoquaux, Gaël
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F chen-etal-2024-reconfidencing
%X Large Language Models (LLMs), such as GPT and LLaMA, are susceptible to generating hallucinated answers in a confident tone. While previous efforts to elicit and calibrate confidence scores have shown some success, they often overlook biases towards certain groups, such as specific nationalities. Existing calibration methods typically focus on average performance, failing to address this disparity. In our study, we demonstrate that the concept of grouping loss is an effective metric for understanding and correcting the heterogeneity in confidence levels. We introduce a novel evaluation dataset, derived from a knowledge base, specifically designed to assess the confidence scores of LLM responses across different groups. Our experimental results highlight significant variations in confidence, which are accurately captured by grouping loss. To tackle this issue, we propose a new method to calibrate the confidence scores of LLMs by considering different groups, a process we term reconfidencing. Our findings indicate that this approach effectively mitigates biases against minority groups, contributing to the development of fairer LLMs.
%R 10.18653/v1/2024.findings-emnlp.85
%U https://aclanthology.org/2024.findings-emnlp.85/
%U https://doi.org/10.18653/v1/2024.findings-emnlp.85
%P 1567-1581
Markdown (Informal)
[Reconfidencing LLMs from the Grouping Loss Perspective](https://aclanthology.org/2024.findings-emnlp.85/) (Chen et al., Findings 2024)
ACL
Lihu Chen, Alexandre Perez-Lebel, Fabian M. Suchanek, and Gaël Varoquaux. 2024. Reconfidencing LLMs from the Grouping Loss Perspective. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 1567–1581, Miami, Florida, USA. Association for Computational Linguistics.
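As a rough, informal illustration of the per-group recalibration idea sketched in the abstract above (not the authors' actual method, which is described in the paper itself), the snippet below fits a separate confidence calibrator for each group of answers. The synthetic data, the group variable, and the use of scikit-learn's IsotonicRegression are all assumptions made purely for this example.

```python
# Illustrative sketch only: per-group recalibration in the spirit of "reconfidencing",
# NOT the method from the paper. The groups, data, and calibrator choice are assumptions.
import numpy as np
from sklearn.isotonic import IsotonicRegression

rng = np.random.default_rng(0)

# Synthetic data: raw LLM confidence scores, correctness labels, and a group id
# (e.g., the nationality of the entity a question asks about).
n = 2000
groups = rng.integers(0, 3, size=n)                  # 3 hypothetical groups
conf = rng.uniform(0.2, 1.0, size=n)                 # raw confidence scores
# Make group 2 systematically overconfident: its true accuracy lags its confidence.
true_p = np.clip(conf - 0.25 * (groups == 2), 0.0, 1.0)
correct = rng.uniform(size=n) < true_p               # observed correctness

# Fit one calibrator per group instead of a single global one, so that
# systematic over- or under-confidence within a group can be corrected.
calibrators = {}
for g in np.unique(groups):
    mask = groups == g
    cal = IsotonicRegression(y_min=0.0, y_max=1.0, out_of_bounds="clip")
    cal.fit(conf[mask], correct[mask].astype(float))
    calibrators[g] = cal

def reconfidence(raw_conf, group):
    """Map a raw confidence score to a group-specific calibrated score."""
    return float(calibrators[group].predict([raw_conf])[0])

print(reconfidence(0.9, 0), reconfidence(0.9, 2))    # group 2 gets pulled down
```

With a single shared calibrator, the overconfident group's scores would stay inflated; fitting per group lets each group's scores be corrected independently, which is the intuition the abstract describes.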