@inproceedings{li-etal-2024-loogle,
title = "{L}oo{GLE}: Can Long-Context Language Models Understand Long Contexts?",
author = "Li, Jiaqi and
Wang, Mengmeng and
Zheng, Zilong and
Zhang, Muhan",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.acl-long.859",
doi = "10.18653/v1/2024.acl-long.859",
pages = "16304--16333",
abstract = "Large language models (LLMs) are typically limited to processing texts within context window size, which has spurred significant research efforts into enhancing LLMs{'} long-context understanding as well as developing high-quality benchmarks to evaluate the ability. However, prior datasets suffer from short comings like short length compared to the context window of modern LLMs; outdated documents that might have data leakage problems; and an emphasis on short dependency tasks only. In this paper, we present LooGLE , a Long Context Generic Language Evaluation benchmark. It features documents post-2022, with over 24,000 tokens per document and 6,000 newly generated questions spanning varying dependency ranges in diverse domains. Human annotators meticulously crafted over 1,100 high-quality question-answer (QA) pairs with thorough cross-validation for a most precise assessment of LLMs{'} long dependency capabilities. We conduct a comprehensive evaluation of representative LLMs on LooGLE . The results indicate that most LLMs have shockingly bad long context ability and fail to capture long dependencies in the context, even when their context window size is enough to fit the entire document. Our results shed light on enhancing the {``}true long-context understanding{''} ability of LLMs instead of merely enlarging their context window.",
}
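A minimal sketch of consuming this entry programmatically, assuming the record above is saved locally as "li-etal-2024-loogle.bib" (a hypothetical filename) and the third-party bibtexparser package (v1 API) is installed:

# Parse the BibTeX record above and pull out a few fields.
# Assumptions: local file name and the bibtexparser v1 API (bibtexparser.load / db.entries).
import bibtexparser

with open("li-etal-2024-loogle.bib") as f:
    db = bibtexparser.load(f)           # parse the file into a BibDatabase object

entry = db.entries[0]                   # the single entry: the LooGLE paper
print(entry["ID"])                      # citation key: li-etal-2024-loogle
print(entry["title"])                   # title string as stored in the entry
print(entry["doi"], entry["pages"])     # 10.18653/v1/2024.acl-long.859, 16304--16333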
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2024-loogle">
<titleInfo>
<title>LooGLE: Can Long-Context Language Models Understand Long Contexts?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiaqi</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mengmeng</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zilong</namePart>
<namePart type="family">Zheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models (LLMs) are typically limited to processing texts within their context window size, which has spurred significant research efforts into enhancing LLMs’ long-context understanding as well as developing high-quality benchmarks to evaluate this ability. However, prior datasets suffer from shortcomings such as short length compared to the context window of modern LLMs; outdated documents that might have data leakage problems; and an emphasis on short-dependency tasks only. In this paper, we present LooGLE, a Long Context Generic Language Evaluation benchmark. It features documents post-2022, with over 24,000 tokens per document and 6,000 newly generated questions spanning varying dependency ranges in diverse domains. Human annotators meticulously crafted over 1,100 high-quality question-answer (QA) pairs with thorough cross-validation for the most precise assessment of LLMs’ long-dependency capabilities. We conduct a comprehensive evaluation of representative LLMs on LooGLE. The results indicate that most LLMs have shockingly bad long-context ability and fail to capture long dependencies in the context, even when their context window size is large enough to fit the entire document. Our results shed light on enhancing the “true long-context understanding” ability of LLMs instead of merely enlarging their context window.</abstract>
<identifier type="citekey">li-etal-2024-loogle</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.859</identifier>
<location>
<url>https://aclanthology.org/2024.acl-long.859</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>16304</start>
<end>16333</end>
</extent>
</part>
</mods>
</modsCollection>
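A minimal sketch for reading the MODS record above with Python's standard-library xml.etree.ElementTree; the local filename "li-etal-2024-loogle.xml" is an assumption:

# Extract title, authors, and DOI from the MODS v3 record.
import xml.etree.ElementTree as ET

NS = {"mods": "http://www.loc.gov/mods/v3"}      # namespace declared on <modsCollection>

tree = ET.parse("li-etal-2024-loogle.xml")       # assumed local copy of the XML above
record = tree.getroot().find("mods:mods", NS)

title = record.find("mods:titleInfo/mods:title", NS).text
authors = [
    " ".join(part.text for part in name.findall("mods:namePart", NS))
    for name in record.findall("mods:name", NS)   # direct children only, i.e. the authors
    if name.find("mods:role/mods:roleTerm", NS).text == "author"
]
doi = record.find("mods:identifier[@type='doi']", NS).text

print(title)    # LooGLE: Can Long-Context Language Models Understand Long Contexts?
print(authors)  # ['Jiaqi Li', 'Mengmeng Wang', 'Zilong Zheng', 'Muhan Zhang']
print(doi)      # 10.18653/v1/2024.acl-long.859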
%0 Conference Proceedings
%T LooGLE: Can Long-Context Language Models Understand Long Contexts?
%A Li, Jiaqi
%A Wang, Mengmeng
%A Zheng, Zilong
%A Zhang, Muhan
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F li-etal-2024-loogle
%X Large language models (LLMs) are typically limited to processing texts within their context window size, which has spurred significant research efforts into enhancing LLMs’ long-context understanding as well as developing high-quality benchmarks to evaluate this ability. However, prior datasets suffer from shortcomings such as short length compared to the context window of modern LLMs; outdated documents that might have data leakage problems; and an emphasis on short-dependency tasks only. In this paper, we present LooGLE, a Long Context Generic Language Evaluation benchmark. It features documents post-2022, with over 24,000 tokens per document and 6,000 newly generated questions spanning varying dependency ranges in diverse domains. Human annotators meticulously crafted over 1,100 high-quality question-answer (QA) pairs with thorough cross-validation for the most precise assessment of LLMs’ long-dependency capabilities. We conduct a comprehensive evaluation of representative LLMs on LooGLE. The results indicate that most LLMs have shockingly bad long-context ability and fail to capture long dependencies in the context, even when their context window size is large enough to fit the entire document. Our results shed light on enhancing the “true long-context understanding” ability of LLMs instead of merely enlarging their context window.
%R 10.18653/v1/2024.acl-long.859
%U https://aclanthology.org/2024.acl-long.859
%U https://doi.org/10.18653/v1/2024.acl-long.859
%P 16304-16333
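A minimal sketch for reading the Refer/Endnote-style record above with only the Python standard library; the filename "li-etal-2024-loogle.enw" is an assumption, and the tag meanings follow the record itself (%T title, %A author, %D year, %P pages, ...):

# Collect each %-tagged field into a dictionary of lists (some tags repeat, e.g. %A, %U).
fields = {}
with open("li-etal-2024-loogle.enw") as f:
    for line in f:
        if line.startswith("%"):
            tag, _, value = line.rstrip("\n").partition(" ")
            fields.setdefault(tag, []).append(value)

print(fields["%T"][0])   # title
print(fields["%A"])      # authors in citation order
print(fields["%P"][0])   # page range 16304-16333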
Markdown (Informal)
[LooGLE: Can Long-Context Language Models Understand Long Contexts?](https://aclanthology.org/2024.acl-long.859) (Li et al., ACL 2024)
ACL
Jiaqi Li, Mengmeng Wang, Zilong Zheng, and Muhan Zhang. 2024. LooGLE: Can Long-Context Language Models Understand Long Contexts?. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 16304–16333, Bangkok, Thailand. Association for Computational Linguistics.