@inproceedings{leong-etal-2022-bloom,
title = "Bloom Library: Multimodal Datasets in 300+ Languages for a Variety of Downstream Tasks",
author = "Leong, Colin and
Nemecek, Joshua and
Mansdorfer, Jacob and
Filighera, Anna and
Owodunni, Abraham and
Whitenack, Daniel",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.590/",
doi = "10.18653/v1/2022.emnlp-main.590",
pages = "8608--8621",
abstract = "We present Bloom Library, a linguistically diverse set of multimodal and multilingual datasets for language modeling, image captioning, visual storytelling, and speech synthesis/recognition. These datasets represent either the most, or among the most, multilingual datasets for each of the included downstream tasks. In total, the initial release of the Bloom Library datasets covers 363 languages across 32 language families. We train downstream task models for various languages represented in the data, showing the viability of the data for future work in low-resource, multimodal NLP and establishing the first known baselines for these downstream tasks in certain languages (e.g., Bisu [bzi], with an estimated population of 700 users). Some of these first-of-their-kind baselines are comparable to state-of-the-art performance for higher-resourced languages. The Bloom Library datasets are released under Creative Commons licenses on the Hugging Face datasets hub to catalyze more linguistically diverse research in the included downstream tasks."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="leong-etal-2022-bloom">
<titleInfo>
<title>Bloom Library: Multimodal Datasets in 300+ Languages for a Variety of Downstream Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Colin</namePart>
<namePart type="family">Leong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joshua</namePart>
<namePart type="family">Nemecek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jacob</namePart>
<namePart type="family">Mansdorfer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Filighera</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abraham</namePart>
<namePart type="family">Owodunni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Whitenack</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present Bloom Library, a linguistically diverse set of multimodal and multilingual datasets for language modeling, image captioning, visual storytelling, and speech synthesis/recognition. These datasets represent either the most, or among the most, multilingual datasets for each of the included downstream tasks. In total, the initial release of the Bloom Library datasets covers 363 languages across 32 language families. We train downstream task models for various languages represented in the data, showing the viability of the data for future work in low-resource, multimodal NLP and establishing the first known baselines for these downstream tasks in certain languages (e.g., Bisu [bzi], with an estimated population of 700 users). Some of these first-of-their-kind baselines are comparable to state-of-the-art performance for higher-resourced languages. The Bloom Library datasets are released under Creative Commons licenses on the Hugging Face datasets hub to catalyze more linguistically diverse research in the included downstream tasks.</abstract>
<identifier type="citekey">leong-etal-2022-bloom</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-main.590</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-main.590/</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>8608</start>
<end>8621</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bloom Library: Multimodal Datasets in 300+ Languages for a Variety of Downstream Tasks
%A Leong, Colin
%A Nemecek, Joshua
%A Mansdorfer, Jacob
%A Filighera, Anna
%A Owodunni, Abraham
%A Whitenack, Daniel
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F leong-etal-2022-bloom
%X We present Bloom Library, a linguistically diverse set of multimodal and multilingual datasets for language modeling, image captioning, visual storytelling, and speech synthesis/recognition. These datasets represent either the most, or among the most, multilingual datasets for each of the included downstream tasks. In total, the initial release of the Bloom Library datasets covers 363 languages across 32 language families. We train downstream task models for various languages represented in the data, showing the viability of the data for future work in low-resource, multimodal NLP and establishing the first known baselines for these downstream tasks in certain languages (e.g., Bisu [bzi], with an estimated population of 700 users). Some of these first-of-their-kind baselines are comparable to state-of-the-art performance for higher-resourced languages. The Bloom Library datasets are released under Creative Commons licenses on the Hugging Face datasets hub to catalyze more linguistically diverse research in the included downstream tasks.
%R 10.18653/v1/2022.emnlp-main.590
%U https://aclanthology.org/2022.emnlp-main.590/
%U https://doi.org/10.18653/v1/2022.emnlp-main.590
%P 8608-8621
Markdown (Informal)
[Bloom Library: Multimodal Datasets in 300+ Languages for a Variety of Downstream Tasks](https://aclanthology.org/2022.emnlp-main.590/) (Leong et al., EMNLP 2022)
ACL