@inproceedings{huang-etal-2024-self,
title = "Self-Evaluation of Large Language Model based on Glass-box Features",
author = "Huang, Hui and
Qu, Yingqi and
Liu, Jing and
Yang, Muyun and
Xu, Bing and
Zhao, Tiejun and
Lu, Wenpeng",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.333/",
doi = "10.18653/v1/2024.findings-emnlp.333",
pages = "5813--5820",
abstract = "The proliferation of open-source Large Language Models (LLMs) underscores the pressing need for evaluation methods. Existing works primarily rely on external evaluators, focusing on training and prompting strategies. However, a crucial aspect {--} model-aware glass-box features {--} is overlooked. In this study, we explore the utility of glass-box features under the scenario of self-evaluation, namely applying an LLM to evaluate its own output. We investigate various glass-box feature groups and discovered that the softmax distribution serves as a reliable quality indicator for self-evaluation. Experimental results on public benchmarks validate the feasibility of self-evaluation of LLMs using glass-box features."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="huang-etal-2024-self">
<titleInfo>
<title>Self-Evaluation of Large Language Model based on Glass-box Features</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hui</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yingqi</namePart>
<namePart type="family">Qu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muyun</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bing</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tiejun</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenpeng</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The proliferation of open-source Large Language Models (LLMs) underscores the pressing need for evaluation methods. Existing works primarily rely on external evaluators, focusing on training and prompting strategies. However, a crucial aspect – model-aware glass-box features – is overlooked. In this study, we explore the utility of glass-box features under the scenario of self-evaluation, namely applying an LLM to evaluate its own output. We investigate various glass-box feature groups and discovered that the softmax distribution serves as a reliable quality indicator for self-evaluation. Experimental results on public benchmarks validate the feasibility of self-evaluation of LLMs using glass-box features.</abstract>
<identifier type="citekey">huang-etal-2024-self</identifier>
<identifier type="doi">10.18653/v1/2024.findings-emnlp.333</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.333/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>5813</start>
<end>5820</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Self-Evaluation of Large Language Model based on Glass-box Features
%A Huang, Hui
%A Qu, Yingqi
%A Liu, Jing
%A Yang, Muyun
%A Xu, Bing
%A Zhao, Tiejun
%A Lu, Wenpeng
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F huang-etal-2024-self
%X The proliferation of open-source Large Language Models (LLMs) underscores the pressing need for evaluation methods. Existing works primarily rely on external evaluators, focusing on training and prompting strategies. However, a crucial aspect – model-aware glass-box features – is overlooked. In this study, we explore the utility of glass-box features under the scenario of self-evaluation, namely applying an LLM to evaluate its own output. We investigate various glass-box feature groups and discovered that the softmax distribution serves as a reliable quality indicator for self-evaluation. Experimental results on public benchmarks validate the feasibility of self-evaluation of LLMs using glass-box features.
%R 10.18653/v1/2024.findings-emnlp.333
%U https://aclanthology.org/2024.findings-emnlp.333/
%U https://doi.org/10.18653/v1/2024.findings-emnlp.333
%P 5813-5820
Markdown (Informal)
[Self-Evaluation of Large Language Model based on Glass-box Features](https://aclanthology.org/2024.findings-emnlp.333/) (Huang et al., Findings 2024)
ACL
- Hui Huang, Yingqi Qu, Jing Liu, Muyun Yang, Bing Xu, Tiejun Zhao, and Wenpeng Lu. 2024. Self-Evaluation of Large Language Model based on Glass-box Features. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 5813–5820, Miami, Florida, USA. Association for Computational Linguistics.