BibTeX
@inproceedings{xia-etal-2024-fofo,
    title = "{FOFO}: A Benchmark to Evaluate {LLM}s{'} Format-Following Capability",
    author = "Xia, Congying and
        Xing, Chen and
        Du, Jiangshu and
        Yang, Xinyi and
        Feng, Yihao and
        Xu, Ran and
        Yin, Wenpeng and
        Xiong, Caiming",
    editor = "Ku, Lun-Wei and
        Martins, Andre and
        Srikumar, Vivek",
    booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = aug,
    year = "2024",
    address = "Bangkok, Thailand",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.acl-long.40",
    doi = "10.18653/v1/2024.acl-long.40",
    pages = "680--699",
    abstract = "This paper presents FoFo, a pioneering benchmark for evaluating large language models{'} (LLMs) ability to follow complex, domain-specific formats, a crucial yet under-examined capability for their application as AI agents. Despite LLMs{'} advancements, existing benchmarks fail to assess their format-following proficiency adequately. FoFo fills this gap with a diverse range of real-world formats and instructions, developed through an AI-Human collaborative method. Our evaluation across both open-source (e.g., Llama 2, WizardLM) and closed-source (e.g., GPT-4, PALM2, Gemini) LLMs highlights three key findings: open-source models significantly lag behind closed-source ones in format adherence; LLMs{'} format-following performance is independent of their content generation quality; and LLMs{'} format proficiency varies across different domains. These insights suggest the need for specialized tuning for format-following skills and highlight FoFo{'}s role in guiding the selection of domain-specific AI agents. FoFo will be publicly released, contributing a critical tool for advancing LLM evaluation and application.",
}
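To use the BibTeX record above, save it to a .bib file and cite it by its citekey. A minimal LaTeX sketch follows; the file name references.bib and the plain bibliography style are placeholders chosen for illustration, not part of the record itself.

\documentclass{article}
\begin{document}
% Cite the FoFo benchmark via the citekey from the entry above.
FoFo \cite{xia-etal-2024-fofo} benchmarks format-following capability.
% Placeholder style and .bib file name; substitute your own setup.
\bibliographystyle{plain}
\bibliography{references}
\end{document}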
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="xia-etal-2024-fofo">
    <titleInfo>
      <title>FOFO: A Benchmark to Evaluate LLMs’ Format-Following Capability</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Congying</namePart>
      <namePart type="family">Xia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chen</namePart>
      <namePart type="family">Xing</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jiangshu</namePart>
      <namePart type="family">Du</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xinyi</namePart>
      <namePart type="family">Yang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yihao</namePart>
      <namePart type="family">Feng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ran</namePart>
      <namePart type="family">Xu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wenpeng</namePart>
      <namePart type="family">Yin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Caiming</namePart>
      <namePart type="family">Xiong</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Lun-Wei</namePart>
        <namePart type="family">Ku</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Andre</namePart>
        <namePart type="family">Martins</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Vivek</namePart>
        <namePart type="family">Srikumar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Bangkok, Thailand</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This paper presents FoFo, a pioneering benchmark for evaluating large language models’ (LLMs) ability to follow complex, domain-specific formats, a crucial yet under-examined capability for their application as AI agents. Despite LLMs’ advancements, existing benchmarks fail to assess their format-following proficiency adequately. FoFo fills this gap with a diverse range of real-world formats and instructions, developed through an AI-Human collaborative method. Our evaluation across both open-source (e.g., Llama 2, WizardLM) and closed-source (e.g., GPT-4, PALM2, Gemini) LLMs highlights three key findings: open-source models significantly lag behind closed-source ones in format adherence; LLMs’ format-following performance is independent of their content generation quality; and LLMs’ format proficiency varies across different domains. These insights suggest the need for specialized tuning for format-following skills and highlight FoFo’s role in guiding the selection of domain-specific AI agents. FoFo will be publicly released, contributing a critical tool for advancing LLM evaluation and application.</abstract>
    <identifier type="citekey">xia-etal-2024-fofo</identifier>
    <identifier type="doi">10.18653/v1/2024.acl-long.40</identifier>
    <location>
      <url>https://aclanthology.org/2024.acl-long.40</url>
    </location>
    <part>
      <date>2024-08</date>
      <extent unit="page">
        <start>680</start>
        <end>699</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T FOFO: A Benchmark to Evaluate LLMs’ Format-Following Capability
%A Xia, Congying
%A Xing, Chen
%A Du, Jiangshu
%A Yang, Xinyi
%A Feng, Yihao
%A Xu, Ran
%A Yin, Wenpeng
%A Xiong, Caiming
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F xia-etal-2024-fofo
%X This paper presents FoFo, a pioneering benchmark for evaluating large language models’ (LLMs) ability to follow complex, domain-specific formats, a crucial yet under-examined capability for their application as AI agents. Despite LLMs’ advancements, existing benchmarks fail to assess their format-following proficiency adequately. FoFo fills this gap with a diverse range of real-world formats and instructions, developed through an AI-Human collaborative method. Our evaluation across both open-source (e.g., Llama 2, WizardLM) and closed-source (e.g., GPT-4, PALM2, Gemini) LLMs highlights three key findings: open-source models significantly lag behind closed-source ones in format adherence; LLMs’ format-following performance is independent of their content generation quality; and LLMs’ format proficiency varies across different domains. These insights suggest the need for specialized tuning for format-following skills and highlight FoFo’s role in guiding the selection of domain-specific AI agents. FoFo will be publicly released, contributing a critical tool for advancing LLM evaluation and application.
%R 10.18653/v1/2024.acl-long.40
%U https://aclanthology.org/2024.acl-long.40
%U https://doi.org/10.18653/v1/2024.acl-long.40
%P 680-699
Markdown (Informal)
[FOFO: A Benchmark to Evaluate LLMs’ Format-Following Capability](https://aclanthology.org/2024.acl-long.40) (Xia et al., ACL 2024)
ACL
- Congying Xia, Chen Xing, Jiangshu Du, Xinyi Yang, Yihao Feng, Ran Xu, Wenpeng Yin, and Caiming Xiong. 2024. FOFO: A Benchmark to Evaluate LLMs’ Format-Following Capability. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 680–699, Bangkok, Thailand. Association for Computational Linguistics.