@inproceedings{gioacchini-etal-2024-agentquest,
title = "{A}gent{Q}uest: A Modular Benchmark Framework to Measure Progress and Improve {LLM} Agents",
author = "Gioacchini, Luca and
Siracusano, Giuseppe and
Sanvito, Davide and
Gashteovski, Kiril and
Friede, David and
Bifulco, Roberto and
Lawrence, Carolin",
editor = "Chang, Kai-Wei and
Lee, Annie and
Rajani, Nazneen",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: System Demonstrations)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-demo.19/",
doi = "10.18653/v1/2024.naacl-demo.19",
pages = "185--193",
abstract = "The advances made by Large Language Models (LLMs) have led to the pursuit of LLM agents that can solve intricate, multi-step reasoning tasks. As with any research pursuit, benchmarking and evaluation are key corner stones to efficient and reliable progress. However, existing benchmarks are often narrow and simply compute overall task success. To face these issues, we propose AgentQuest {--} a framework where (i) both benchmarks and metrics are modular and easily extensible through well documented and easy-to-use APIs; (ii) we offer two new evaluation metrics that can reliably track LLM agent progress while solving a task. We exemplify the utility of the metrics on two use cases wherein we identify common failure points and refine the agent architecture to obtain a significant performance increase. Together with the research community, we hope to extend AgentQuest further and therefore we make it available under https://github.com/nec-research/agentquest."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gioacchini-etal-2024-agentquest">
<titleInfo>
<title>AgentQuest: A Modular Benchmark Framework to Measure Progress and Improve LLM Agents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luca</namePart>
<namePart type="family">Gioacchini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="family">Siracusano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Davide</namePart>
<namePart type="family">Sanvito</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kiril</namePart>
<namePart type="family">Gashteovski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Friede</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Bifulco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolin</namePart>
<namePart type="family">Lawrence</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: System Demonstrations)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Annie</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nazneen</namePart>
<namePart type="family">Rajani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>The advances made by Large Language Models (LLMs) have led to the pursuit of LLM agents that can solve intricate, multi-step reasoning tasks. As with any research pursuit, benchmarking and evaluation are key cornerstones of efficient and reliable progress. However, existing benchmarks are often narrow and simply compute overall task success. To address these issues, we propose AgentQuest – a framework where (i) both benchmarks and metrics are modular and easily extensible through well-documented and easy-to-use APIs; (ii) we offer two new evaluation metrics that can reliably track LLM agent progress while solving a task. We exemplify the utility of the metrics on two use cases wherein we identify common failure points and refine the agent architecture to obtain a significant performance increase. Together with the research community, we hope to extend AgentQuest further and therefore we make it available at https://github.com/nec-research/agentquest.</abstract>
<identifier type="citekey">gioacchini-etal-2024-agentquest</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-demo.19</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-demo.19/</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>185</start>
<end>193</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AgentQuest: A Modular Benchmark Framework to Measure Progress and Improve LLM Agents
%A Gioacchini, Luca
%A Siracusano, Giuseppe
%A Sanvito, Davide
%A Gashteovski, Kiril
%A Friede, David
%A Bifulco, Roberto
%A Lawrence, Carolin
%Y Chang, Kai-Wei
%Y Lee, Annie
%Y Rajani, Nazneen
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: System Demonstrations)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F gioacchini-etal-2024-agentquest
%X The advances made by Large Language Models (LLMs) have led to the pursuit of LLM agents that can solve intricate, multi-step reasoning tasks. As with any research pursuit, benchmarking and evaluation are key cornerstones of efficient and reliable progress. However, existing benchmarks are often narrow and simply compute overall task success. To address these issues, we propose AgentQuest – a framework where (i) both benchmarks and metrics are modular and easily extensible through well-documented and easy-to-use APIs; (ii) we offer two new evaluation metrics that can reliably track LLM agent progress while solving a task. We exemplify the utility of the metrics on two use cases wherein we identify common failure points and refine the agent architecture to obtain a significant performance increase. Together with the research community, we hope to extend AgentQuest further and therefore we make it available at https://github.com/nec-research/agentquest.
%R 10.18653/v1/2024.naacl-demo.19
%U https://aclanthology.org/2024.naacl-demo.19/
%U https://doi.org/10.18653/v1/2024.naacl-demo.19
%P 185-193
Markdown (Informal)
[AgentQuest: A Modular Benchmark Framework to Measure Progress and Improve LLM Agents](https://aclanthology.org/2024.naacl-demo.19/) (Gioacchini et al., NAACL 2024)
ACL
Luca Gioacchini, Giuseppe Siracusano, Davide Sanvito, Kiril Gashteovski, David Friede, Roberto Bifulco, and Carolin Lawrence. 2024. AgentQuest: A Modular Benchmark Framework to Measure Progress and Improve LLM Agents. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: System Demonstrations), pages 185–193, Mexico City, Mexico. Association for Computational Linguistics.