@inproceedings{pan-etal-2024-human,
    title = "Human-Centered Design Recommendations for {LLM}-as-a-judge",
    author = "Pan, Qian and
      Ashktorab, Zahra and
      Desmond, Michael and
      Santill{\'a}n Cooper, Mart{\'i}n and
      Johnson, James and
      Nair, Rahul and
      Daly, Elizabeth and
      Geyer, Werner",
    editor = "Soni, Nikita and
      Flek, Lucie and
      Sharma, Ashish and
      Yang, Diyi and
      Hooker, Sara and
      Schwartz, H. Andrew",
    booktitle = "Proceedings of the 1st Human-Centered Large Language Modeling Workshop",
    month = aug,
    year = "2024",
    address = "Bangkok, Thailand",
    publisher = "ACL",
    url = "https://aclanthology.org/2024.hucllm-1.2/",
    doi = "10.18653/v1/2024.hucllm-1.2",
    pages = "16--29",
    abstract = "Traditional reference-based metrics, such as BLEU and ROUGE, are less effective for assessing outputs from Large Language Models (LLMs) that produce highly creative or superior-quality text, or in situations where reference outputs are unavailable. While human evaluation remains an option, it is costly and difficult to scale. Recent work using LLMs as evaluators (LLM-as-a-judge) is promising, but trust and reliability remain a significant concern. Integrating human input is crucial to ensure the criteria used to evaluate are aligned with the human's intent, and evaluations are robust and consistent. This paper presents a user study of a design exploration called EvaluLLM, which enables users to leverage LLMs as customizable judges, promoting human involvement to balance trust and cost-saving potential with caution. Through interviews with eight domain experts, we identified the need for assistance in developing effective evaluation criteria that align the LLM-as-a-judge with practitioners' preferences and expectations. We offer findings and design recommendations to optimize human-assisted LLM-as-a-judge systems."
}
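
To cite the paper from LaTeX, the BibTeX entry above can be dropped into a bibliography file as-is; a minimal sketch, assuming the entry is saved in a file named references.bib (the filename, document body, and plain bibliography style are illustrative):

% main.tex -- minimal document citing the entry above,
% assumed to be saved in references.bib (illustrative filename)
\documentclass{article}
\begin{document}
Human-centered design guidance for LLM-as-a-judge pipelines
is given by \cite{pan-etal-2024-human}.
\bibliographystyle{plain}
\bibliography{references}
\end{document}

Compile with pdflatex, then bibtex, then pdflatex twice so the citation resolves.
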
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="pan-etal-2024-human">
    <titleInfo>
      <title>Human-Centered Design Recommendations for LLM-as-a-judge</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Qian</namePart>
      <namePart type="family">Pan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zahra</namePart>
      <namePart type="family">Ashktorab</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Michael</namePart>
      <namePart type="family">Desmond</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Martín</namePart>
      <namePart type="family">Santillán Cooper</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">James</namePart>
      <namePart type="family">Johnson</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rahul</namePart>
      <namePart type="family">Nair</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Elizabeth</namePart>
      <namePart type="family">Daly</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Werner</namePart>
      <namePart type="family">Geyer</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 1st Human-Centered Large Language Modeling Workshop</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nikita</namePart>
        <namePart type="family">Soni</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lucie</namePart>
        <namePart type="family">Flek</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ashish</namePart>
        <namePart type="family">Sharma</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Diyi</namePart>
        <namePart type="family">Yang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sara</namePart>
        <namePart type="family">Hooker</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">H</namePart>
        <namePart type="given">Andrew</namePart>
        <namePart type="family">Schwartz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>ACL</publisher>
        <place>
          <placeTerm type="text">Bangkok, Thailand</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Traditional reference-based metrics, such as BLEU and ROUGE, are less effective for assessing outputs from Large Language Models (LLMs) that produce highly creative or superior-quality text, or in situations where reference outputs are unavailable. While human evaluation remains an option, it is costly and difficult to scale. Recent work using LLMs as evaluators (LLM-as-a-judge) is promising, but trust and reliability remain a significant concern. Integrating human input is crucial to ensure the criteria used to evaluate are aligned with the human’s intent, and evaluations are robust and consistent. This paper presents a user study of a design exploration called EvaluLLM, which enables users to leverage LLMs as customizable judges, promoting human involvement to balance trust and cost-saving potential with caution. Through interviews with eight domain experts, we identified the need for assistance in developing effective evaluation criteria that align the LLM-as-a-judge with practitioners’ preferences and expectations. We offer findings and design recommendations to optimize human-assisted LLM-as-a-judge systems.</abstract>
    <identifier type="citekey">pan-etal-2024-human</identifier>
    <identifier type="doi">10.18653/v1/2024.hucllm-1.2</identifier>
    <location>
      <url>https://aclanthology.org/2024.hucllm-1.2/</url>
    </location>
    <part>
      <date>2024-08</date>
      <extent unit="page">
        <start>16</start>
        <end>29</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Human-Centered Design Recommendations for LLM-as-a-judge
%A Pan, Qian
%A Ashktorab, Zahra
%A Desmond, Michael
%A Santillán Cooper, Martín
%A Johnson, James
%A Nair, Rahul
%A Daly, Elizabeth
%A Geyer, Werner
%Y Soni, Nikita
%Y Flek, Lucie
%Y Sharma, Ashish
%Y Yang, Diyi
%Y Hooker, Sara
%Y Schwartz, H. Andrew
%S Proceedings of the 1st Human-Centered Large Language Modeling Workshop
%D 2024
%8 August
%I ACL
%C Bangkok, Thailand
%F pan-etal-2024-human
%X Traditional reference-based metrics, such as BLEU and ROUGE, are less effective for assessing outputs from Large Language Models (LLMs) that produce highly creative or superior-quality text, or in situations where reference outputs are unavailable. While human evaluation remains an option, it is costly and difficult to scale. Recent work using LLMs as evaluators (LLM-as-a-judge) is promising, but trust and reliability remain a significant concern. Integrating human input is crucial to ensure the criteria used to evaluate are aligned with the human’s intent, and evaluations are robust and consistent. This paper presents a user study of a design exploration called EvaluLLM, which enables users to leverage LLMs as customizable judges, promoting human involvement to balance trust and cost-saving potential with caution. Through interviews with eight domain experts, we identified the need for assistance in developing effective evaluation criteria that align the LLM-as-a-judge with practitioners’ preferences and expectations. We offer findings and design recommendations to optimize human-assisted LLM-as-a-judge systems.
%R 10.18653/v1/2024.hucllm-1.2
%U https://aclanthology.org/2024.hucllm-1.2/
%U https://doi.org/10.18653/v1/2024.hucllm-1.2
%P 16-29
Markdown (Informal)
[Human-Centered Design Recommendations for LLM-as-a-judge](https://aclanthology.org/2024.hucllm-1.2/) (Pan et al., HuCLLM 2024)
ACL
Qian Pan, Zahra Ashktorab, Michael Desmond, Martín Santillán Cooper, James Johnson, Rahul Nair, Elizabeth Daly, and Werner Geyer. 2024. Human-Centered Design Recommendations for LLM-as-a-judge. In Proceedings of the 1st Human-Centered Large Language Modeling Workshop, pages 16–29, Bangkok, Thailand. ACL.