@inproceedings{jandaghi-etal-2024-faithful-persona,
title = "Faithful Persona-based Conversational Dataset Generation with Large Language Models",
author = "Jandaghi, Pegah and
Sheng, Xianghai and
Bai, Xinyi and
Pujara, Jay and
Sidahmed, Hakim",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.904/",
doi = "10.18653/v1/2024.findings-acl.904",
pages = "15245--15270",
abstract = "High-quality conversational datasets are essential for developing AI models that can communicate with users.One way to foster deeper interactions between a chatbot and its user is through *personas*, aspects of the user`s character that provide insights into their personality, motivations, and behaviors.Training Natural Language Processing (NLP) models on a diverse and comprehensive persona-based dataset can lead to conversational models that create a deeper connection with the user, and maintain their engagement. In this paper, we leverage the power of Large Language Models (LLMs) to create a large, high-quality conversational dataset from a seed dataset. We propose a Generator-Critic architecture framework to expand the initial dataset, while improving the quality of its conversations.The Generator is an LLM prompted to output conversations.The Critic consists of a mixture of expert LLMs that control the quality of the generated conversations.These experts select the best generated conversations, which we then use to improve the Generator.We release Synthetic-Persona-Chat, consisting of 20k conversations seeded from Persona-Chat.We evaluate the quality of Synthetic-Persona-Chat and our generation framework on different dimensions through extensive experiments, and observe that the losing rate of Synthetic-Persona-Chat against Persona-Chat during an AI detection test decreases from 17.2{\%} to 8.8{\%} over three iterations."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jandaghi-etal-2024-faithful-persona">
<titleInfo>
<title>Faithful Persona-based Conversational Dataset Generation with Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pegah</namePart>
<namePart type="family">Jandaghi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xianghai</namePart>
<namePart type="family">Sheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinyi</namePart>
<namePart type="family">Bai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jay</namePart>
<namePart type="family">Pujara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hakim</namePart>
<namePart type="family">Sidahmed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>High-quality conversational datasets are essential for developing AI models that can communicate with users.One way to foster deeper interactions between a chatbot and its user is through *personas*, aspects of the user‘s character that provide insights into their personality, motivations, and behaviors.Training Natural Language Processing (NLP) models on a diverse and comprehensive persona-based dataset can lead to conversational models that create a deeper connection with the user, and maintain their engagement. In this paper, we leverage the power of Large Language Models (LLMs) to create a large, high-quality conversational dataset from a seed dataset. We propose a Generator-Critic architecture framework to expand the initial dataset, while improving the quality of its conversations.The Generator is an LLM prompted to output conversations.The Critic consists of a mixture of expert LLMs that control the quality of the generated conversations.These experts select the best generated conversations, which we then use to improve the Generator.We release Synthetic-Persona-Chat, consisting of 20k conversations seeded from Persona-Chat.We evaluate the quality of Synthetic-Persona-Chat and our generation framework on different dimensions through extensive experiments, and observe that the losing rate of Synthetic-Persona-Chat against Persona-Chat during an AI detection test decreases from 17.2% to 8.8% over three iterations.</abstract>
<identifier type="citekey">jandaghi-etal-2024-faithful-persona</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.904</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.904/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>15245</start>
<end>15270</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Faithful Persona-based Conversational Dataset Generation with Large Language Models
%A Jandaghi, Pegah
%A Sheng, Xianghai
%A Bai, Xinyi
%A Pujara, Jay
%A Sidahmed, Hakim
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F jandaghi-etal-2024-faithful-persona
%X High-quality conversational datasets are essential for developing AI models that can communicate with users.One way to foster deeper interactions between a chatbot and its user is through *personas*, aspects of the user‘s character that provide insights into their personality, motivations, and behaviors.Training Natural Language Processing (NLP) models on a diverse and comprehensive persona-based dataset can lead to conversational models that create a deeper connection with the user, and maintain their engagement. In this paper, we leverage the power of Large Language Models (LLMs) to create a large, high-quality conversational dataset from a seed dataset. We propose a Generator-Critic architecture framework to expand the initial dataset, while improving the quality of its conversations.The Generator is an LLM prompted to output conversations.The Critic consists of a mixture of expert LLMs that control the quality of the generated conversations.These experts select the best generated conversations, which we then use to improve the Generator.We release Synthetic-Persona-Chat, consisting of 20k conversations seeded from Persona-Chat.We evaluate the quality of Synthetic-Persona-Chat and our generation framework on different dimensions through extensive experiments, and observe that the losing rate of Synthetic-Persona-Chat against Persona-Chat during an AI detection test decreases from 17.2% to 8.8% over three iterations.
%R 10.18653/v1/2024.findings-acl.904
%U https://aclanthology.org/2024.findings-acl.904/
%U https://doi.org/10.18653/v1/2024.findings-acl.904
%P 15245-15270
Markdown (Informal)
[Faithful Persona-based Conversational Dataset Generation with Large Language Models](https://aclanthology.org/2024.findings-acl.904/) (Jandaghi et al., Findings 2024)
ACL