@inproceedings{efrat-etal-2023-lmentry,
title = "{LM}entry: A Language Model Benchmark of Elementary Language Tasks",
author = "Efrat, Avia and
Honovich, Or and
Levy, Omer",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.666",
doi = "10.18653/v1/2023.findings-acl.666",
pages = "10476--10501",
    abstract = "As the performance of large language models rapidly improves, benchmarks are getting larger and more complex as well. We present LMentry, a benchmark that avoids this {``}arms race{''} by focusing on a compact set of tasks that are trivial to humans, e.g. writing a sentence containing a specific word, identifying which words in a list belong to a specific category, or choosing which of two words is longer. LMentry is specifically designed to provide quick and interpretable insights into the capabilities and robustness of large language models. Our experiments reveal a wide variety of failure cases that, while immediately obvious to humans, pose a considerable challenge for large language models, including OpenAI{'}s latest 175B-parameter instruction-tuned model, TextDavinci002. LMentry complements contemporary evaluation approaches of large language models, providing a quick, automatic, and easy-to-run {``}unit test{''}, without resorting to large benchmark suites of complex tasks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="efrat-etal-2023-lmentry">
    <titleInfo>
      <title>LMentry: A Language Model Benchmark of Elementary Language Tasks</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Avia</namePart>
      <namePart type="family">Efrat</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Or</namePart>
      <namePart type="family">Honovich</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Omer</namePart>
      <namePart type="family">Levy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2023</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Anna</namePart>
        <namePart type="family">Rogers</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jordan</namePart>
        <namePart type="family">Boyd-Graber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Naoaki</namePart>
        <namePart type="family">Okazaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Toronto, Canada</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>As the performance of large language models rapidly improves, benchmarks are getting larger and more complex as well. We present LMentry, a benchmark that avoids this “arms race” by focusing on a compact set of tasks that are trivial to humans, e.g. writing a sentence containing a specific word, identifying which words in a list belong to a specific category, or choosing which of two words is longer. LMentry is specifically designed to provide quick and interpretable insights into the capabilities and robustness of large language models. Our experiments reveal a wide variety of failure cases that, while immediately obvious to humans, pose a considerable challenge for large language models, including OpenAI’s latest 175B-parameter instruction-tuned model, TextDavinci002. LMentry complements contemporary evaluation approaches of large language models, providing a quick, automatic, and easy-to-run “unit test”, without resorting to large benchmark suites of complex tasks.</abstract>
    <identifier type="citekey">efrat-etal-2023-lmentry</identifier>
    <identifier type="doi">10.18653/v1/2023.findings-acl.666</identifier>
    <location>
      <url>https://aclanthology.org/2023.findings-acl.666</url>
    </location>
    <part>
      <date>2023-07</date>
      <extent unit="page">
        <start>10476</start>
        <end>10501</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T LMentry: A Language Model Benchmark of Elementary Language Tasks
%A Efrat, Avia
%A Honovich, Or
%A Levy, Omer
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F efrat-etal-2023-lmentry
%X As the performance of large language models rapidly improves, benchmarks are getting larger and more complex as well. We present LMentry, a benchmark that avoids this “arms race” by focusing on a compact set of tasks that are trivial to humans, e.g. writing a sentence containing a specific word, identifying which words in a list belong to a specific category, or choosing which of two words is longer. LMentry is specifically designed to provide quick and interpretable insights into the capabilities and robustness of large language models. Our experiments reveal a wide variety of failure cases that, while immediately obvious to humans, pose a considerable challenge for large language models, including OpenAI’s latest 175B-parameter instruction-tuned model, TextDavinci002. LMentry complements contemporary evaluation approaches of large language models, providing a quick, automatic, and easy-to-run “unit test”, without resorting to large benchmark suites of complex tasks.
%R 10.18653/v1/2023.findings-acl.666
%U https://aclanthology.org/2023.findings-acl.666
%U https://doi.org/10.18653/v1/2023.findings-acl.666
%P 10476-10501
Markdown (Informal)
[LMentry: A Language Model Benchmark of Elementary Language Tasks](https://aclanthology.org/2023.findings-acl.666) (Efrat et al., Findings 2023)
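The tasks described in the paper's abstract are simple enough to be verified programmatically, which is what makes the benchmark's quick, automatic "unit test" style of evaluation possible. As a rough illustration only (this is not the official LMentry scoring code; both function names are hypothetical), two of the abstract's example tasks could be checked along these lines:

```python
import re

def check_sentence_contains_word(sentence: str, word: str) -> bool:
    # Task: "write a sentence containing a specific word".
    # Passes if the target word appears as a whole word, case-insensitively.
    return re.search(rf"\b{re.escape(word)}\b", sentence, re.IGNORECASE) is not None

def check_longer_word(answer: str, word1: str, word2: str) -> bool:
    # Task: "choose which of two words is longer".
    # Passes if the model's (lightly normalized) answer is the longer word.
    longer = word1 if len(word1) > len(word2) else word2
    return answer.strip().strip('."\'').lower() == longer.lower()

# Example usage:
assert check_sentence_contains_word("The cat sat on the mat.", "cat")
assert check_longer_word("elephant.", "ant", "elephant")
```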