@inproceedings{hemanthage-2024-generalized,
title = "Generalized Visual-Language Grounding with Complex Language Context",
author = "Hemanthage, Bhathiya",
editor = "Inoue, Koji and
Fu, Yahui and
Axelsson, Agnes and
Ohashi, Atsumoto and
Madureira, Brielen and
Zenimoto, Yuki and
Mohapatra, Biswesh and
Stricker, Armand and
Khosla, Sopan",
booktitle = "Proceedings of the 20th Workshop of Young Researchers' Roundtable on Spoken Dialogue Systems",
month = sep,
year = "2024",
address = "Kyoto, Japan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.yrrsds-1.21/",
pages = "57--59",
abstract = "My research focus on \textbf{Visual Dialogues} and \textbf{Generalized Visual-Language Grounding with Complex Language Context}. Specifically, my research aim to utilize Large Language Models (LLMs) to build \textit{conversational agents capable of comprehending and responding to visual cues}. Visual-Language Pre-trained (VLP) models, primarily utilizing transformer-based encoder-decoder architectures, are extensively employed across a range of visual-language tasks, such as visual question answering (VQA) and referring expression comprehension (REC). The effectiveness of these models stems from their robust visual-language integration capabilities. However, their performance is constrained in more complex applications like multimodal conversational agents, where intricate and extensive language contexts pose significant challenges. These tasks demands language-only reasoning before engaging in multimodal fusion. In response, my research investigates the application of Large Language Models (LLMs) with advance comprehension and generation capabilities to enhance performance in complex multimodal tasks, particularly multimodal dialogues. In brief, my work in visual dialogues revolves around two major research questions. i) How to redefine visually grounded conversational agent architectures to benefit from LLMs ii) How to transfer the large body of knowledge encoded in LLMs to conversational systems."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hemanthage-2024-generalized">
<titleInfo>
<title>Generalized Visual-Language Grounding with Complex Language Context</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bhathiya</namePart>
<namePart type="family">Hemanthage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th Workshop of Young Researchers’ Roundtable on Spoken Dialogue Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Koji</namePart>
<namePart type="family">Inoue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yahui</namePart>
<namePart type="family">Fu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Agnes</namePart>
<namePart type="family">Axelsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Atsumoto</namePart>
<namePart type="family">Ohashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brielen</namePart>
<namePart type="family">Madureira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuki</namePart>
<namePart type="family">Zenimoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Biswesh</namePart>
<namePart type="family">Mohapatra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Armand</namePart>
<namePart type="family">Stricker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sopan</namePart>
<namePart type="family">Khosla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Kyoto, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>My research focuses on Visual Dialogues and Generalized Visual-Language Grounding with Complex Language Context. Specifically, my research aims to utilize Large Language Models (LLMs) to build conversational agents capable of comprehending and responding to visual cues. Visual-Language Pre-trained (VLP) models, primarily utilizing transformer-based encoder-decoder architectures, are extensively employed across a range of visual-language tasks, such as visual question answering (VQA) and referring expression comprehension (REC). The effectiveness of these models stems from their robust visual-language integration capabilities. However, their performance is constrained in more complex applications like multimodal conversational agents, where intricate and extensive language contexts pose significant challenges. These tasks demand language-only reasoning before engaging in multimodal fusion. In response, my research investigates the application of LLMs with advanced comprehension and generation capabilities to enhance performance in complex multimodal tasks, particularly multimodal dialogues. In brief, my work in visual dialogues revolves around two major research questions: i) how to redefine visually grounded conversational agent architectures to benefit from LLMs, and ii) how to transfer the large body of knowledge encoded in LLMs to conversational systems.</abstract>
<identifier type="citekey">hemanthage-2024-generalized</identifier>
<location>
<url>https://aclanthology.org/2024.yrrsds-1.21/</url>
</location>
<part>
<date>2024-09</date>
<extent unit="page">
<start>57</start>
<end>59</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Generalized Visual-Language Grounding with Complex Language Context
%A Hemanthage, Bhathiya
%Y Inoue, Koji
%Y Fu, Yahui
%Y Axelsson, Agnes
%Y Ohashi, Atsumoto
%Y Madureira, Brielen
%Y Zenimoto, Yuki
%Y Mohapatra, Biswesh
%Y Stricker, Armand
%Y Khosla, Sopan
%S Proceedings of the 20th Workshop of Young Researchers’ Roundtable on Spoken Dialogue Systems
%D 2024
%8 September
%I Association for Computational Linguistics
%C Kyoto, Japan
%F hemanthage-2024-generalized
%X My research focuses on Visual Dialogues and Generalized Visual-Language Grounding with Complex Language Context. Specifically, my research aims to utilize Large Language Models (LLMs) to build conversational agents capable of comprehending and responding to visual cues. Visual-Language Pre-trained (VLP) models, primarily utilizing transformer-based encoder-decoder architectures, are extensively employed across a range of visual-language tasks, such as visual question answering (VQA) and referring expression comprehension (REC). The effectiveness of these models stems from their robust visual-language integration capabilities. However, their performance is constrained in more complex applications like multimodal conversational agents, where intricate and extensive language contexts pose significant challenges. These tasks demand language-only reasoning before engaging in multimodal fusion. In response, my research investigates the application of LLMs with advanced comprehension and generation capabilities to enhance performance in complex multimodal tasks, particularly multimodal dialogues. In brief, my work in visual dialogues revolves around two major research questions: i) how to redefine visually grounded conversational agent architectures to benefit from LLMs, and ii) how to transfer the large body of knowledge encoded in LLMs to conversational systems.
%U https://aclanthology.org/2024.yrrsds-1.21/
%P 57-59
Markdown (Informal)
[Generalized Visual-Language Grounding with Complex Language Context](https://aclanthology.org/2024.yrrsds-1.21/) (Hemanthage, YRRSDS 2024)