@inproceedings{leeb-scholkopf-2024-diverse,
title = "A diverse Multilingual News Headlines Dataset from around the World",
author = {Leeb, Felix and
Sch{\"o}lkopf, Bernhard},
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-short.55/",
doi = "10.18653/v1/2024.naacl-short.55",
pages = "647--652",
abstract = "Babel Briefings is a novel dataset featuring 4.7 million news headlines from August 2020 to November 2021, across 30 languages and 54 locations worldwide with English translations of all articles included. Designed for natural language processing and media studies, it serves as a high-quality dataset for training or evaluating language models as well as offering a simple, accessible collection of articles, for example, to analyze global news coverage and cultural narratives. As a simple demonstration of the analyses facilitated by this dataset, we use a basic procedure using a TF-IDF weighted similarity metric to group articles into clusters about the same event. We then visualize the \textit{event signatures} of the event showing articles of which languages appear over time, revealing intuitive features based on the proximity of the event and unexpectedness of the event. The dataset is available on [Kaggle](https://www.kaggle.com/datasets/felixludos/babel-briefings) and [HuggingFace](https://huggingface.co/datasets/felixludos/babel-briefings) with accompanying [GitHub](https://github.com/felixludos/babel-briefings) code."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="leeb-scholkopf-2024-diverse">
<titleInfo>
<title>A diverse Multilingual News Headlines Dataset from around the World</title>
</titleInfo>
<name type="personal">
<namePart type="given">Felix</namePart>
<namePart type="family">Leeb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bernhard</namePart>
<namePart type="family">Schölkopf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Babel Briefings is a novel dataset featuring 4.7 million news headlines from August 2020 to November 2021, across 30 languages and 54 locations worldwide with English translations of all articles included. Designed for natural language processing and media studies, it serves as a high-quality dataset for training or evaluating language models as well as offering a simple, accessible collection of articles, for example, to analyze global news coverage and cultural narratives. As a simple demonstration of the analyses facilitated by this dataset, we use a basic procedure using a TF-IDF weighted similarity metric to group articles into clusters about the same event. We then visualize the event signatures of the event showing articles of which languages appear over time, revealing intuitive features based on the proximity of the event and unexpectedness of the event. The dataset is available on [Kaggle](https://www.kaggle.com/datasets/felixludos/babel-briefings) and [HuggingFace](https://huggingface.co/datasets/felixludos/babel-briefings) with accompanying [GitHub](https://github.com/felixludos/babel-briefings) code.</abstract>
<identifier type="citekey">leeb-scholkopf-2024-diverse</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-short.55</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-short.55/</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>647</start>
<end>652</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A diverse Multilingual News Headlines Dataset from around the World
%A Leeb, Felix
%A Schölkopf, Bernhard
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F leeb-scholkopf-2024-diverse
%X Babel Briefings is a novel dataset featuring 4.7 million news headlines from August 2020 to November 2021, across 30 languages and 54 locations worldwide with English translations of all articles included. Designed for natural language processing and media studies, it serves as a high-quality dataset for training or evaluating language models as well as offering a simple, accessible collection of articles, for example, to analyze global news coverage and cultural narratives. As a simple demonstration of the analyses facilitated by this dataset, we use a basic procedure using a TF-IDF weighted similarity metric to group articles into clusters about the same event. We then visualize the event signatures of the event showing articles of which languages appear over time, revealing intuitive features based on the proximity of the event and unexpectedness of the event. The dataset is available on [Kaggle](https://www.kaggle.com/datasets/felixludos/babel-briefings) and [HuggingFace](https://huggingface.co/datasets/felixludos/babel-briefings) with accompanying [GitHub](https://github.com/felixludos/babel-briefings) code.
%R 10.18653/v1/2024.naacl-short.55
%U https://aclanthology.org/2024.naacl-short.55/
%U https://doi.org/10.18653/v1/2024.naacl-short.55
%P 647-652
Markdown (Informal)
[A diverse Multilingual News Headlines Dataset from around the World](https://aclanthology.org/2024.naacl-short.55/) (Leeb & Schölkopf, NAACL 2024)
ACL
- Felix Leeb and Bernhard Schölkopf. 2024. A diverse Multilingual News Headlines Dataset from around the World. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers), pages 647–652, Mexico City, Mexico. Association for Computational Linguistics.