BibTeX
@inproceedings{yu-etal-2024-rethinking,
title = "Rethinking the Evaluation of In-Context Learning for {LLM}s",
author = "Yu, Guoxin and
Liu, Lemao and
Yu, Mo and
Yu, Yue and
Ao, Xiang",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.779/",
doi = "10.18653/v1/2024.emnlp-main.779",
pages = "14068--14082",
abstract = "In-context learning (ICL) has demonstrated excellent performance across various downstream NLP tasks, especially when synergized with powerful large language models (LLMs). Existing studies evaluate ICL methods primarily based on downstream task performance. This evaluation protocol overlooks the significant cost associated with the demonstration configuration process, i.e., tuning the demonstration as the ICL prompt. However, in this work, we point out that the evaluation protocol leads to unfair comparisons and potentially biased evaluation, because we surprisingly find the correlation between the configuration costs and task performance. Then we call for a two-dimensional evaluation paradigm that considers both of these aspects, facilitating a fairer comparison.Finally, based on our empirical finding that the optimized demonstration on one language model generalizes across language models of different sizes, we introduce a simple yet efficient strategy that can be applied to any ICL method as a plugin, yielding a better trade-off between the two dimensions according to the proposed evaluation paradigm."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yu-etal-2024-rethinking">
<titleInfo>
<title>Rethinking the Evaluation of In-Context Learning for LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Guoxin</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lemao</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mo</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiang</namePart>
<namePart type="family">Ao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>In-context learning (ICL) has demonstrated excellent performance across various downstream NLP tasks, especially when synergized with powerful large language models (LLMs). Existing studies evaluate ICL methods primarily based on downstream task performance. This evaluation protocol overlooks the significant cost associated with the demonstration configuration process, i.e., tuning the demonstration as the ICL prompt. In this work, we point out that this evaluation protocol leads to unfair comparisons and potentially biased evaluation, because we surprisingly find a correlation between the configuration costs and task performance. We therefore call for a two-dimensional evaluation paradigm that considers both of these aspects, facilitating a fairer comparison. Finally, based on our empirical finding that a demonstration optimized on one language model generalizes across language models of different sizes, we introduce a simple yet efficient strategy that can be applied to any ICL method as a plugin, yielding a better trade-off between the two dimensions according to the proposed evaluation paradigm.</abstract>
<identifier type="citekey">yu-etal-2024-rethinking</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.779</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.779/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>14068</start>
<end>14082</end>
</extent>
</part>
</mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T Rethinking the Evaluation of In-Context Learning for LLMs
%A Yu, Guoxin
%A Liu, Lemao
%A Yu, Mo
%A Yu, Yue
%A Ao, Xiang
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F yu-etal-2024-rethinking
%X In-context learning (ICL) has demonstrated excellent performance across various downstream NLP tasks, especially when synergized with powerful large language models (LLMs). Existing studies evaluate ICL methods primarily based on downstream task performance. This evaluation protocol overlooks the significant cost associated with the demonstration configuration process, i.e., tuning the demonstration as the ICL prompt. In this work, we point out that this evaluation protocol leads to unfair comparisons and potentially biased evaluation, because we surprisingly find a correlation between the configuration costs and task performance. We therefore call for a two-dimensional evaluation paradigm that considers both of these aspects, facilitating a fairer comparison. Finally, based on our empirical finding that a demonstration optimized on one language model generalizes across language models of different sizes, we introduce a simple yet efficient strategy that can be applied to any ICL method as a plugin, yielding a better trade-off between the two dimensions according to the proposed evaluation paradigm.
%R 10.18653/v1/2024.emnlp-main.779
%U https://aclanthology.org/2024.emnlp-main.779/
%U https://doi.org/10.18653/v1/2024.emnlp-main.779
%P 14068-14082

Markdown (Informal)
[Rethinking the Evaluation of In-Context Learning for LLMs](https://aclanthology.org/2024.emnlp-main.779/) (Yu et al., EMNLP 2024)

ACL
Guoxin Yu, Lemao Liu, Mo Yu, Yue Yu, and Xiang Ao. 2024. Rethinking the Evaluation of In-Context Learning for LLMs. In *Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing*, pages 14068–14082, Miami, Florida, USA. Association for Computational Linguistics.