@inproceedings{pfister-hotho-2024-supergleber,
title = "{S}uper{GLEB}er: {G}erman Language Understanding Evaluation Benchmark",
author = "Pfister, Jan and
Hotho, Andreas",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-long.438/",
doi = "10.18653/v1/2024.naacl-long.438",
pages = "7904--7923",
abstract = "We assemble a broad Natural Language Understanding benchmark suite for the German language and consequently evaluate a wide array of existing German-capable models in order to create a better understanding of the current state of German LLMs. Our benchmark consists of 29 different tasks ranging over different types such as document classification, sequence tagging, sentence similarity, and question answering, on which we evaluate 10 different German-pretrained models, thereby charting the landscape of German LLMs. In our comprehensive evaluation we find that encoder models are a good choice for most tasks, but also that the largest encoder model does not necessarily perform best for all tasks. We make our benchmark suite and a leaderboard publically available at https://supergleber.professor-x.de and encourage the community to contribute new tasks and evaluate more models on it (https://github.com/LSX-UniWue/SuperGLEBer)."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pfister-hotho-2024-supergleber">
<titleInfo>
<title>SuperGLEBer: German Language Understanding Evaluation Benchmark</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Pfister</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Hotho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We assemble a broad Natural Language Understanding benchmark suite for the German language and consequently evaluate a wide array of existing German-capable models in order to create a better understanding of the current state of German LLMs. Our benchmark consists of 29 different tasks ranging over different types such as document classification, sequence tagging, sentence similarity, and question answering, on which we evaluate 10 different German-pretrained models, thereby charting the landscape of German LLMs. In our comprehensive evaluation we find that encoder models are a good choice for most tasks, but also that the largest encoder model does not necessarily perform best for all tasks. We make our benchmark suite and a leaderboard publically available at https://supergleber.professor-x.de and encourage the community to contribute new tasks and evaluate more models on it (https://github.com/LSX-UniWue/SuperGLEBer).</abstract>
<identifier type="citekey">pfister-hotho-2024-supergleber</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-long.438</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-long.438/</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>7904</start>
<end>7923</end>
</extent>
</part>
</mods>
</modsCollection>
Jan Pfister and Andreas Hotho. 2024. SuperGLEBer: German Language Understanding Evaluation Benchmark. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 7904–7923, Mexico City, Mexico. Association for Computational Linguistics.