@inproceedings{luukkonen-etal-2023-fingpt,
title = "{F}in{GPT}: Large Generative Models for a Small Language",
author = "Luukkonen, Risto and
Komulainen, Ville and
Luoma, Jouni and
Eskelinen, Anni and
Kanerva, Jenna and
Kupari, Hanna-Mari and
Ginter, Filip and
Laippala, Veronika and
Muennighoff, Niklas and
Piktus, Aleksandra and
Wang, Thomas and
Tazi, Nouamane and
Scao, Teven and
Wolf, Thomas and
Suominen, Osma and
Sairanen, Samuli and
Merioksa, Mikko and
Heinonen, Jyrki and
Vahtola, Aija and
Antao, Samuel and
Pyysalo, Sampo",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.164",
doi = "10.18653/v1/2023.emnlp-main.164",
pages = "2710--2726",
abstract = "Large language models (LLMs) excel in many tasks in NLP and beyond, but most open models have very limited coverage of smaller languages and LLM work tends to focus on languages where nearly unlimited data is available for pretraining. In this work, we study the challenges of creating LLMs for Finnish, a language spoken by less than 0.1{\%} of the world population. We compile an extensive dataset of Finnish combining web crawls, news, social media and eBooks. We pursue two approaches to pretrain models: 1) we train seven monolingual models from scratch (186M to 13B parameters) dubbed FinGPT, 2) we continue the pretraining of the multilingual BLOOM model on a mix of its original training data and Finnish, resulting in a 176 billion parameter model we call BLUUMI. For model evaluation, we introduce FIN-bench, a version of BIG-bench with Finnish tasks. We also assess other model qualities such as toxicity and bias. Our models and tools are openly available at \url{https://turkunlp.org/gpt3-finnish}.",
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="luukkonen-etal-2023-fingpt">
<titleInfo>
<title>FinGPT: Large Generative Models for a Small Language</title>
</titleInfo>
<name type="personal">
<namePart type="given">Risto</namePart>
<namePart type="family">Luukkonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ville</namePart>
<namePart type="family">Komulainen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jouni</namePart>
<namePart type="family">Luoma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anni</namePart>
<namePart type="family">Eskelinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jenna</namePart>
<namePart type="family">Kanerva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanna-Mari</namePart>
<namePart type="family">Kupari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Filip</namePart>
<namePart type="family">Ginter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronika</namePart>
<namePart type="family">Laippala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Niklas</namePart>
<namePart type="family">Muennighoff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aleksandra</namePart>
<namePart type="family">Piktus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nouamane</namePart>
<namePart type="family">Tazi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Teven</namePart>
<namePart type="family">Scao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Wolf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Osma</namePart>
<namePart type="family">Suominen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samuli</namePart>
<namePart type="family">Sairanen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikko</namePart>
<namePart type="family">Merioksa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jyrki</namePart>
<namePart type="family">Heinonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aija</namePart>
<namePart type="family">Vahtola</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="family">Antao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sampo</namePart>
<namePart type="family">Pyysalo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models (LLMs) excel in many tasks in NLP and beyond, but most open models have very limited coverage of smaller languages and LLM work tends to focus on languages where nearly unlimited data is available for pretraining. In this work, we study the challenges of creating LLMs for Finnish, a language spoken by less than 0.1% of the world population. We compile an extensive dataset of Finnish combining web crawls, news, social media and eBooks. We pursue two approaches to pretrain models: 1) we train seven monolingual models from scratch (186M to 13B parameters) dubbed FinGPT, 2) we continue the pretraining of the multilingual BLOOM model on a mix of its original training data and Finnish, resulting in a 176 billion parameter model we call BLUUMI. For model evaluation, we introduce FIN-bench, a version of BIG-bench with Finnish tasks. We also assess other model qualities such as toxicity and bias. Our models and tools are openly available at https://turkunlp.org/gpt3-finnish.</abstract>
<identifier type="citekey">luukkonen-etal-2023-fingpt</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.164</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.164</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>2710</start>
<end>2726</end>
</extent>
</part>
</mods>
</modsCollection>

%0 Conference Proceedings
%T FinGPT: Large Generative Models for a Small Language
%A Luukkonen, Risto
%A Komulainen, Ville
%A Luoma, Jouni
%A Eskelinen, Anni
%A Kanerva, Jenna
%A Kupari, Hanna-Mari
%A Ginter, Filip
%A Laippala, Veronika
%A Muennighoff, Niklas
%A Piktus, Aleksandra
%A Wang, Thomas
%A Tazi, Nouamane
%A Scao, Teven
%A Wolf, Thomas
%A Suominen, Osma
%A Sairanen, Samuli
%A Merioksa, Mikko
%A Heinonen, Jyrki
%A Vahtola, Aija
%A Antao, Samuel
%A Pyysalo, Sampo
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F luukkonen-etal-2023-fingpt
%X Large language models (LLMs) excel in many tasks in NLP and beyond, but most open models have very limited coverage of smaller languages and LLM work tends to focus on languages where nearly unlimited data is available for pretraining. In this work, we study the challenges of creating LLMs for Finnish, a language spoken by less than 0.1% of the world population. We compile an extensive dataset of Finnish combining web crawls, news, social media and eBooks. We pursue two approaches to pretrain models: 1) we train seven monolingual models from scratch (186M to 13B parameters) dubbed FinGPT, 2) we continue the pretraining of the multilingual BLOOM model on a mix of its original training data and Finnish, resulting in a 176 billion parameter model we call BLUUMI. For model evaluation, we introduce FIN-bench, a version of BIG-bench with Finnish tasks. We also assess other model qualities such as toxicity and bias. Our models and tools are openly available at https://turkunlp.org/gpt3-finnish.
%R 10.18653/v1/2023.emnlp-main.164
%U https://aclanthology.org/2023.emnlp-main.164
%U https://doi.org/10.18653/v1/2023.emnlp-main.164
%P 2710-2726

Markdown (Informal)
[FinGPT: Large Generative Models for a Small Language](https://aclanthology.org/2023.emnlp-main.164) (Luukkonen et al., EMNLP 2023)
ACL
- Risto Luukkonen, Ville Komulainen, Jouni Luoma, Anni Eskelinen, Jenna Kanerva, Hanna-Mari Kupari, Filip Ginter, Veronika Laippala, Niklas Muennighoff, Aleksandra Piktus, Thomas Wang, Nouamane Tazi, Teven Scao, Thomas Wolf, Osma Suominen, Samuli Sairanen, Mikko Merioksa, Jyrki Heinonen, Aija Vahtola, et al. 2023. FinGPT: Large Generative Models for a Small Language. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pages 2710–2726, Singapore. Association for Computational Linguistics.