@inproceedings{shu-etal-2024-dont,
title = "You don`t need a personality test to know these models are unreliable: Assessing the Reliability of Large Language Models on Psychometric Instruments",
author = "Shu, Bangzhao and
Zhang, Lechen and
Choi, Minje and
Dunagan, Lavinia and
Logeswaran, Lajanugen and
Lee, Moontae and
Card, Dallas and
Jurgens, David",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-long.295/",
doi = "10.18653/v1/2024.naacl-long.295",
pages = "5263--5281",
abstract = "The versatility of Large Language Models (LLMs) on natural language understanding tasks has made them popular for research in social sciences. To properly understand the properties and innate personas of LLMs, researchers have performed studies that involve using prompts in the form of questions that ask LLMs about particular opinions. In this study, we take a cautionary step back and examine whether the current format of prompting LLMs elicits responses in a consistent and robust manner. We first construct a dataset that contains 693 questions encompassing 39 different instruments of persona measurement on 115 persona axes. Additionally, we design a set of prompts containing minor variations and examine LLMs' capabilities to generate answers, as well as prompt variations to examine their consistency with respect to content-level variations such as switching the order of response options or negating the statement. Our experiments on 17 different LLMs reveal that even simple perturbations significantly downgrade a model`s question-answering ability, and that most LLMs have low negation consistency. Our results suggest that the currently widespread practice of prompting is insufficient to accurately and reliably capture model perceptions, and we therefore discuss potential alternatives to improve these issues."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shu-etal-2024-dont">
<titleInfo>
<title>You don’t need a personality test to know these models are unreliable: Assessing the Reliability of Large Language Models on Psychometric Instruments</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bangzhao</namePart>
<namePart type="family">Shu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lechen</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minje</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lavinia</namePart>
<namePart type="family">Dunagan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lajanugen</namePart>
<namePart type="family">Logeswaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Moontae</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dallas</namePart>
<namePart type="family">Card</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The versatility of Large Language Models (LLMs) on natural language understanding tasks has made them popular for research in social sciences. To properly understand the properties and innate personas of LLMs, researchers have performed studies that involve using prompts in the form of questions that ask LLMs about particular opinions. In this study, we take a cautionary step back and examine whether the current format of prompting LLMs elicits responses in a consistent and robust manner. We first construct a dataset that contains 693 questions encompassing 39 different instruments of persona measurement on 115 persona axes. Additionally, we design a set of prompts containing minor variations and examine LLMs’ capabilities to generate answers, as well as prompt variations to examine their consistency with respect to content-level variations such as switching the order of response options or negating the statement. Our experiments on 17 different LLMs reveal that even simple perturbations significantly downgrade a model’s question-answering ability, and that most LLMs have low negation consistency. Our results suggest that the currently widespread practice of prompting is insufficient to accurately and reliably capture model perceptions, and we therefore discuss potential alternatives to improve these issues.</abstract>
<identifier type="citekey">shu-etal-2024-dont</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-long.295</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-long.295/</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>5263</start>
<end>5281</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T You don’t need a personality test to know these models are unreliable: Assessing the Reliability of Large Language Models on Psychometric Instruments
%A Shu, Bangzhao
%A Zhang, Lechen
%A Choi, Minje
%A Dunagan, Lavinia
%A Logeswaran, Lajanugen
%A Lee, Moontae
%A Card, Dallas
%A Jurgens, David
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F shu-etal-2024-dont
%X The versatility of Large Language Models (LLMs) on natural language understanding tasks has made them popular for research in social sciences. To properly understand the properties and innate personas of LLMs, researchers have performed studies that involve using prompts in the form of questions that ask LLMs about particular opinions. In this study, we take a cautionary step back and examine whether the current format of prompting LLMs elicits responses in a consistent and robust manner. We first construct a dataset that contains 693 questions encompassing 39 different instruments of persona measurement on 115 persona axes. Additionally, we design a set of prompts containing minor variations and examine LLMs’ capabilities to generate answers, as well as prompt variations to examine their consistency with respect to content-level variations such as switching the order of response options or negating the statement. Our experiments on 17 different LLMs reveal that even simple perturbations significantly downgrade a model’s question-answering ability, and that most LLMs have low negation consistency. Our results suggest that the currently widespread practice of prompting is insufficient to accurately and reliably capture model perceptions, and we therefore discuss potential alternatives to improve these issues.
%R 10.18653/v1/2024.naacl-long.295
%U https://aclanthology.org/2024.naacl-long.295/
%U https://doi.org/10.18653/v1/2024.naacl-long.295
%P 5263-5281
Markdown (Informal)
[You don’t need a personality test to know these models are unreliable: Assessing the Reliability of Large Language Models on Psychometric Instruments](https://aclanthology.org/2024.naacl-long.295/) (Shu et al., NAACL 2024)
ACL
Bangzhao Shu, Lechen Zhang, Minje Choi, Lavinia Dunagan, Lajanugen Logeswaran, Moontae Lee, Dallas Card, and David Jurgens. 2024. You don’t need a personality test to know these models are unreliable: Assessing the Reliability of Large Language Models on Psychometric Instruments. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 5263–5281, Mexico City, Mexico. Association for Computational Linguistics.
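
As a rough illustration of the perturbation protocol described in the abstract (re-presenting a psychometric item with its response options reordered and checking whether the model's answer stays the same), here is a minimal Python sketch. The prompt wording, function names, and the stand-in answers are assumptions for illustration only, not the authors' released code or data.

```python
# Hypothetical sketch of an option-order consistency check, as described in the
# paper's abstract: format one questionnaire item, format it again with the
# response options reversed, and compare the model's two answers.

def build_prompt(statement: str, options: list[str]) -> str:
    """Format a single psychometric item as a multiple-choice prompt."""
    lines = [f"Statement: {statement}", "How much do you agree?"]
    lines += [f"{i + 1}. {opt}" for i, opt in enumerate(options)]
    lines.append("Answer with the option text only.")
    return "\n".join(lines)

def is_order_consistent(answer_original: str, answer_reversed: str) -> bool:
    """Consistent if the same option text is chosen under both orderings."""
    return answer_original.strip().lower() == answer_reversed.strip().lower()

if __name__ == "__main__":
    options = ["Strongly disagree", "Disagree", "Neutral", "Agree", "Strongly agree"]
    statement = "I enjoy being the center of attention."  # illustrative item, not from the dataset

    prompt_original = build_prompt(statement, options)
    prompt_reversed = build_prompt(statement, list(reversed(options)))

    # In a real evaluation, both prompts would be sent to the LLM under test;
    # here two fake answers stand in to show how the check is applied.
    answer_original, answer_reversed = "Agree", "Disagree"
    print(prompt_original)
    print(prompt_reversed)
    print("order-consistent:", is_order_consistent(answer_original, answer_reversed))
```

The same comparison pattern extends to the other content-level variation the abstract mentions (negating the statement), where consistency would instead require the answer to flip to the mirrored option.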