BibTeX
@inproceedings{rau-etal-2024-bergen,
title = "{BERGEN}: A Benchmarking Library for Retrieval-Augmented Generation",
author = "Rau, David and
D{\'e}jean, Herv{\'e} and
Chirkova, Nadezhda and
Formal, Thibault and
Wang, Shuai and
Clinchant, St{\'e}phane and
Nikoulina, Vassilina",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.449/",
doi = "10.18653/v1/2024.findings-emnlp.449",
pages = "7640--7663",
abstract = "Retrieval-Augmented Generation allows to enhance Large Language Models with external knowledge. In response to the recent popularity of generative LLMs, many RAG approaches have been proposed, which involve an intricate number of different configurations such as evaluation datasets, collections, metrics, retrievers, and LLMs. Inconsistent benchmarking poses a major challenge in comparing approaches and understanding the impact of each component in the pipeline. In this work, we study best practices that lay the groundwork for a systematic evaluation of RAG and present BERGEN, an end-to-end library for reproducible research standardizing RAG experiments. In an extensive study focusing on QA, we benchmark different state-of-the-art retrievers, rerankers, and LLMs. Additionally, we analyze existing RAG metrics and datasets."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rau-etal-2024-bergen">
<titleInfo>
<title>BERGEN: A Benchmarking Library for Retrieval-Augmented Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Rau</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hervé</namePart>
<namePart type="family">Déjean</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadezhda</namePart>
<namePart type="family">Chirkova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thibault</namePart>
<namePart type="family">Formal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuai</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stéphane</namePart>
<namePart type="family">Clinchant</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vassilina</namePart>
<namePart type="family">Nikoulina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Retrieval-Augmented Generation allows to enhance Large Language Models with external knowledge. In response to the recent popularity of generative LLMs, many RAG approaches have been proposed, which involve an intricate number of different configurations such as evaluation datasets, collections, metrics, retrievers, and LLMs. Inconsistent benchmarking poses a major challenge in comparing approaches and understanding the impact of each component in the pipeline. In this work, we study best practices that lay the groundwork for a systematic evaluation of RAG and present BERGEN, an end-to-end library for reproducible research standardizing RAG experiments. In an extensive study focusing on QA, we benchmark different state-of-the-art retrievers, rerankers, and LLMs. Additionally, we analyze existing RAG metrics and datasets.</abstract>
<identifier type="citekey">rau-etal-2024-bergen</identifier>
<identifier type="doi">10.18653/v1/2024.findings-emnlp.449</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.449/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>7640</start>
<end>7663</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T BERGEN: A Benchmarking Library for Retrieval-Augmented Generation
%A Rau, David
%A Déjean, Hervé
%A Chirkova, Nadezhda
%A Formal, Thibault
%A Wang, Shuai
%A Clinchant, Stéphane
%A Nikoulina, Vassilina
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F rau-etal-2024-bergen
%X Retrieval-Augmented Generation allows to enhance Large Language Models with external knowledge. In response to the recent popularity of generative LLMs, many RAG approaches have been proposed, which involve an intricate number of different configurations such as evaluation datasets, collections, metrics, retrievers, and LLMs. Inconsistent benchmarking poses a major challenge in comparing approaches and understanding the impact of each component in the pipeline. In this work, we study best practices that lay the groundwork for a systematic evaluation of RAG and present BERGEN, an end-to-end library for reproducible research standardizing RAG experiments. In an extensive study focusing on QA, we benchmark different state-of-the-art retrievers, rerankers, and LLMs. Additionally, we analyze existing RAG metrics and datasets.
%R 10.18653/v1/2024.findings-emnlp.449
%U https://aclanthology.org/2024.findings-emnlp.449/
%U https://doi.org/10.18653/v1/2024.findings-emnlp.449
%P 7640-7663
Markdown (Informal)
[BERGEN: A Benchmarking Library for Retrieval-Augmented Generation](https://aclanthology.org/2024.findings-emnlp.449/) (Rau et al., Findings 2024)
ACL
David Rau, Hervé Déjean, Nadezhda Chirkova, Thibault Formal, Shuai Wang, Stéphane Clinchant, and Vassilina Nikoulina. 2024. BERGEN: A Benchmarking Library for Retrieval-Augmented Generation. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 7640–7663, Miami, Florida, USA. Association for Computational Linguistics.
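The abstract describes an end-to-end pipeline of retrievers, rerankers, LLMs, and evaluation metrics. As an illustration only, the toy Python sketch below composes those four component types into a single evaluation loop; every name in it (retrieve, rerank, generate, exact_match, run_benchmark, the toy collection and dataset) is hypothetical and does not reflect BERGEN's actual API or configuration format — see the paper and library for the real interface.

```python
# Illustrative sketch only: a toy end-to-end RAG evaluation loop with the four
# component types named in the abstract (retriever, reranker, generator/LLM,
# metric). All names are hypothetical and do NOT reflect BERGEN's API.
from dataclasses import dataclass


@dataclass
class Example:
    question: str
    answer: str


# A tiny document collection and QA set standing in for a real benchmark.
COLLECTION = [
    "Bergen is a city on the west coast of Norway.",
    "Retrieval-augmented generation combines a retriever with a generator.",
    "EMNLP 2024 was held in Miami, Florida, USA.",
]
DATASET = [Example("Where was EMNLP 2024 held?", "Miami")]


def retrieve(question: str, docs: list[str], k: int = 2) -> list[str]:
    """Toy lexical retriever: rank documents by word overlap with the question."""
    q_terms = set(question.lower().split())
    scored = sorted(docs, key=lambda d: -len(q_terms & set(d.lower().split())))
    return scored[:k]


def rerank(question: str, docs: list[str]) -> list[str]:
    """Toy reranker: prefer shorter documents among the retrieved candidates."""
    return sorted(docs, key=len)


def generate(question: str, context: list[str]) -> str:
    """Stand-in for an LLM call: echo the top-ranked passage as the 'answer'."""
    return context[0] if context else ""


def exact_match(prediction: str, reference: str) -> float:
    """Simple containment-style match (real RAG metrics are richer)."""
    return float(reference.lower() in prediction.lower())


def run_benchmark(dataset: list[Example]) -> float:
    """End-to-end loop: retrieve -> rerank -> generate -> score, averaged over examples."""
    scores = []
    for ex in dataset:
        candidates = retrieve(ex.question, COLLECTION)
        reranked = rerank(ex.question, candidates)
        prediction = generate(ex.question, reranked)
        scores.append(exact_match(prediction, ex.answer))
    return sum(scores) / len(scores)


if __name__ == "__main__":
    print(f"match score: {run_benchmark(DATASET):.2f}")
```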