@inproceedings{kaneko-2024-enhancing,
    title = "Enhancing Emotion Recognition in Spoken Dialogue Systems through Multimodal Integration and Personalization",
    author = "Kaneko, Takumasa",
    editor = "Inoue, Koji and
      Fu, Yahui and
      Axelsson, Agnes and
      Ohashi, Atsumoto and
      Madureira, Brielen and
      Zenimoto, Yuki and
      Mohapatra, Biswesh and
      Stricker, Armand and
      Khosla, Sopan",
    booktitle = "Proceedings of the 20th Workshop of Young Researchers' Roundtable on Spoken Dialogue Systems",
    month = sep,
    year = "2024",
    address = "Kyoto, Japan",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.yrrsds-1.2/",
    pages = "5--7",
    abstract = "My research interests focus on multimodal emotion recognition and personalization in emotion recognition tasks. In multimodal emotion recognition, existing studies demonstrate that integrating various data types like speech, text, and video enhances accuracy. However, real-time constraints and high dataset costs limit their practical application. I propose constructing a multimodal emotion recognition model by combining available unimodal datasets. In terms of personalization, traditional discrete emotion labels often fail to capture the complexity of human emotions. Although recent methods embed speaker characteristics to boost prediction accuracy, they require extensive retraining. I introduce continuous prompt tuning, which updates only the speaker prompts while keeping the speech encoder weights fixed, enabling the addition of new speaker data without additional retraining. This paper discusses these existing research gaps and presents novel approaches to address them, aiming to significantly improve emotion recognition in spoken dialogue systems."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kaneko-2024-enhancing">
    <titleInfo>
        <title>Enhancing Emotion Recognition in Spoken Dialogue Systems through Multimodal Integration and Personalization</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Takumasa</namePart>
        <namePart type="family">Kaneko</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2024-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 20th Workshop of Young Researchers’ Roundtable on Spoken Dialogue Systems</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Koji</namePart>
            <namePart type="family">Inoue</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Yahui</namePart>
            <namePart type="family">Fu</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Agnes</namePart>
            <namePart type="family">Axelsson</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Atsumoto</namePart>
            <namePart type="family">Ohashi</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Brielen</namePart>
            <namePart type="family">Madureira</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Yuki</namePart>
            <namePart type="family">Zenimoto</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Biswesh</namePart>
            <namePart type="family">Mohapatra</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Armand</namePart>
            <namePart type="family">Stricker</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Sopan</namePart>
            <namePart type="family">Khosla</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Kyoto, Japan</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>My research interests focus on multimodal emotion recognition and personalization in emotion recognition tasks. In multimodal emotion recognition, existing studies demonstrate that integrating various data types like speech, text, and video enhances accuracy. However, real-time constraints and high dataset costs limit their practical application. I propose constructing a multimodal emotion recognition model by combining available unimodal datasets. In terms of personalization, traditional discrete emotion labels often fail to capture the complexity of human emotions. Although recent methods embed speaker characteristics to boost prediction accuracy, they require extensive retraining. I introduce continuous prompt tuning, which updates only the speaker prompts while keeping the speech encoder weights fixed, enabling the addition of new speaker data without additional retraining. This paper discusses these existing research gaps and presents novel approaches to address them, aiming to significantly improve emotion recognition in spoken dialogue systems.</abstract>
    <identifier type="citekey">kaneko-2024-enhancing</identifier>
    <location>
        <url>https://aclanthology.org/2024.yrrsds-1.2/</url>
    </location>
    <part>
        <date>2024-09</date>
        <extent unit="page">
            <start>5</start>
            <end>7</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Enhancing Emotion Recognition in Spoken Dialogue Systems through Multimodal Integration and Personalization
%A Kaneko, Takumasa
%Y Inoue, Koji
%Y Fu, Yahui
%Y Axelsson, Agnes
%Y Ohashi, Atsumoto
%Y Madureira, Brielen
%Y Zenimoto, Yuki
%Y Mohapatra, Biswesh
%Y Stricker, Armand
%Y Khosla, Sopan
%S Proceedings of the 20th Workshop of Young Researchers’ Roundtable on Spoken Dialogue Systems
%D 2024
%8 September
%I Association for Computational Linguistics
%C Kyoto, Japan
%F kaneko-2024-enhancing
%X My research interests focus on multimodal emotion recognition and personalization in emotion recognition tasks. In multimodal emotion recognition, existing studies demonstrate that integrating various data types like speech, text, and video enhances accuracy. However, real-time constraints and high dataset costs limit their practical application. I propose constructing a multimodal emotion recognition model by combining available unimodal datasets. In terms of personalization, traditional discrete emotion labels often fail to capture the complexity of human emotions. Although recent methods embed speaker characteristics to boost prediction accuracy, they require extensive retraining. I introduce continuous prompt tuning, which updates only the speaker prompts while keeping the speech encoder weights fixed, enabling the addition of new speaker data without additional retraining. This paper discusses these existing research gaps and presents novel approaches to address them, aiming to significantly improve emotion recognition in spoken dialogue systems.
%U https://aclanthology.org/2024.yrrsds-1.2/
%P 5-7
Markdown (Informal)
[Enhancing Emotion Recognition in Spoken Dialogue Systems through Multimodal Integration and Personalization](https://aclanthology.org/2024.yrrsds-1.2/) (Kaneko, YRRSDS 2024)
ACL
Takumasa Kaneko. 2024. Enhancing Emotion Recognition in Spoken Dialogue Systems through Multimodal Integration and Personalization. In Proceedings of the 20th Workshop of Young Researchers’ Roundtable on Spoken Dialogue Systems, pages 5–7, Kyoto, Japan. Association for Computational Linguistics.
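The abstract describes continuous prompt tuning in which only per-speaker prompt vectors are updated while the speech encoder weights stay frozen, so new speakers can be added without retraining the encoder. The following is a minimal PyTorch-style sketch of that idea, not the paper's code: the class and argument names (SpeakerPromptedEmotionModel, num_speakers, prompt_len, etc.) are illustrative assumptions, the encoder is any module returning frame-level hidden states, and the small classification head is also trained here, which goes slightly beyond the prompts-only setup the abstract states.

import torch
import torch.nn as nn

class SpeakerPromptedEmotionModel(nn.Module):
    """Frozen speech encoder plus per-speaker continuous prompts (illustrative sketch)."""

    def __init__(self, encoder: nn.Module, hidden_dim: int,
                 num_speakers: int, prompt_len: int, num_emotions: int):
        super().__init__()
        self.encoder = encoder
        for p in self.encoder.parameters():
            p.requires_grad = False               # speech encoder weights stay fixed
        # one learnable prompt of shape (prompt_len, hidden_dim) per speaker
        self.prompts = nn.Parameter(
            0.02 * torch.randn(num_speakers, prompt_len, hidden_dim))
        self.head = nn.Linear(hidden_dim, num_emotions)

    def forward(self, speech: torch.Tensor, speaker_id: torch.Tensor):
        # speech: (batch, time, feat_dim); the encoder is assumed to return
        # frame-level states of shape (batch, time, hidden_dim)
        with torch.no_grad():
            states = self.encoder(speech)
        prompt = self.prompts[speaker_id]           # (batch, prompt_len, hidden_dim)
        fused = torch.cat([prompt, states], dim=1)  # prepend the speaker prompt
        return self.head(fused.mean(dim=1))         # pooled emotion logits

# Only the prompts (and, in this sketch, the small head) are optimized; adding a
# new speaker just means adding one new row to `prompts`:
# optimizer = torch.optim.AdamW(
#     [model.prompts] + list(model.head.parameters()), lr=1e-3)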