@inproceedings{zhang-etal-2024-dialogstudio,
title = "{D}ialog{S}tudio: Towards Richest and Most Diverse Unified Dataset Collection for Conversational {AI}",
author = "Zhang, Jianguo and
Qian, Kun and
Liu, Zhiwei and
Heinecke, Shelby and
Meng, Rui and
Liu, Ye and
Yu, Zhou and
Wang, Huan and
Savarese, Silvio and
Xiong, Caiming",
editor = "Graham, Yvette and
Purver, Matthew",
booktitle = "Findings of the Association for Computational Linguistics: EACL 2024",
month = mar,
year = "2024",
address = "St. Julian{'}s, Malta",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-eacl.152",
pages = "2299--2315",
abstract = "Despite advancements in conversational AI, language models encounter challenges to handle diverse conversational tasks, and existing dialogue dataset collections often lack diversity and comprehensiveness. To tackle these issues, we introduce DialogStudio: the largest and most diverse collection of dialogue datasets, unified under a consistent format while preserving their original information. Our collection encompasses data from open-domain dialogues, task-oriented dialogues, natural language understanding, conversational recommendation, dialogue summarization, and knowledge-grounded dialogues, making it an incredibly rich and diverse resource for dialogue research and model training.To further enhance the utility of DialogStudio, we identify the licenses for each dataset, design external knowledge and domain-aware prompts for selected dialogues to facilitate instruction-aware fine-tuning. To improve transparency and support dataset and task-based research, as well as language model pre-training, all datasets, licenses, codes, and models associated with DialogStudio will be made publicly accessible.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2024-dialogstudio">
<titleInfo>
<title>DialogStudio: Towards Richest and Most Diverse Unified Dataset Collection for Conversational AI</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jianguo</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kun</namePart>
<namePart type="family">Qian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiwei</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shelby</namePart>
<namePart type="family">Heinecke</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Meng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ye</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhou</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Silvio</namePart>
<namePart type="family">Savarese</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Caiming</namePart>
<namePart type="family">Xiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yvette</namePart>
<namePart type="family">Graham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthew</namePart>
<namePart type="family">Purver</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">St. Julian’s, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Despite advancements in conversational AI, language models encounter challenges to handle diverse conversational tasks, and existing dialogue dataset collections often lack diversity and comprehensiveness. To tackle these issues, we introduce DialogStudio: the largest and most diverse collection of dialogue datasets, unified under a consistent format while preserving their original information. Our collection encompasses data from open-domain dialogues, task-oriented dialogues, natural language understanding, conversational recommendation, dialogue summarization, and knowledge-grounded dialogues, making it an incredibly rich and diverse resource for dialogue research and model training.To further enhance the utility of DialogStudio, we identify the licenses for each dataset, design external knowledge and domain-aware prompts for selected dialogues to facilitate instruction-aware fine-tuning. To improve transparency and support dataset and task-based research, as well as language model pre-training, all datasets, licenses, codes, and models associated with DialogStudio will be made publicly accessible.</abstract>
<identifier type="citekey">zhang-etal-2024-dialogstudio</identifier>
<location>
<url>https://aclanthology.org/2024.findings-eacl.152</url>
</location>
<part>
<date>2024-03</date>
<extent unit="page">
<start>2299</start>
<end>2315</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DialogStudio: Towards Richest and Most Diverse Unified Dataset Collection for Conversational AI
%A Zhang, Jianguo
%A Qian, Kun
%A Liu, Zhiwei
%A Heinecke, Shelby
%A Meng, Rui
%A Liu, Ye
%A Yu, Zhou
%A Wang, Huan
%A Savarese, Silvio
%A Xiong, Caiming
%Y Graham, Yvette
%Y Purver, Matthew
%S Findings of the Association for Computational Linguistics: EACL 2024
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julian’s, Malta
%F zhang-etal-2024-dialogstudio
%X Despite advancements in conversational AI, language models encounter challenges to handle diverse conversational tasks, and existing dialogue dataset collections often lack diversity and comprehensiveness. To tackle these issues, we introduce DialogStudio: the largest and most diverse collection of dialogue datasets, unified under a consistent format while preserving their original information. Our collection encompasses data from open-domain dialogues, task-oriented dialogues, natural language understanding, conversational recommendation, dialogue summarization, and knowledge-grounded dialogues, making it an incredibly rich and diverse resource for dialogue research and model training.To further enhance the utility of DialogStudio, we identify the licenses for each dataset, design external knowledge and domain-aware prompts for selected dialogues to facilitate instruction-aware fine-tuning. To improve transparency and support dataset and task-based research, as well as language model pre-training, all datasets, licenses, codes, and models associated with DialogStudio will be made publicly accessible.
%U https://aclanthology.org/2024.findings-eacl.152
%P 2299-2315
Markdown (Informal)
[DialogStudio: Towards Richest and Most Diverse Unified Dataset Collection for Conversational AI](https://aclanthology.org/2024.findings-eacl.152) (Zhang et al., Findings 2024)
ACL
- Jianguo Zhang, Kun Qian, Zhiwei Liu, Shelby Heinecke, Rui Meng, Ye Liu, Zhou Yu, Huan Wang, Silvio Savarese, and Caiming Xiong. 2024. DialogStudio: Towards Richest and Most Diverse Unified Dataset Collection for Conversational AI. In Findings of the Association for Computational Linguistics: EACL 2024, pages 2299–2315, St. Julian’s, Malta. Association for Computational Linguistics.