@inproceedings{maharana-etal-2022-multimodal,
    title = "Multimodal Intent Discovery from Livestream Videos",
    author = "Maharana, Adyasha and
      Tran, Quan and
      Dernoncourt, Franck and
      Yoon, Seunghyun and
      Bui, Trung and
      Chang, Walter and
      Bansal, Mohit",
    editor = "Carpuat, Marine and
      de Marneffe, Marie-Catherine and
      Meza Ruiz, Ivan Vladimir",
    booktitle = "Findings of the Association for Computational Linguistics: NAACL 2022",
    month = jul,
    year = "2022",
    address = "Seattle, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.findings-naacl.36",
    doi = "10.18653/v1/2022.findings-naacl.36",
    pages = "476--489",
    abstract = "Individuals, educational institutions, and businesses are prolific at generating instructional video content such as {``}how-to{''} and tutorial guides. While significant progress has been made in basic video understanding tasks, identifying procedural intent within these instructional videos is a challenging and important task that remains unexplored but essential to video summarization, search, and recommendations. This paper introduces the problem of instructional intent identification and extraction from software instructional livestreams. We construct and present a new multimodal dataset consisting of software instructional livestreams and containing manual annotations for both detailed and abstract procedural intent that enable training and evaluation of joint video and text understanding models. We then introduce a multimodal cascaded cross-attention model to efficiently combine the weaker and noisier video signal with the more discriminative text signal. Our experiments show that our proposed model brings significant gains compared to strong baselines, including large-scale pretrained multimodal models. Our analysis further identifies that the task benefits from spatial as well as motion features extracted from videos, and provides insight on how the video signal is preferentially used for intent discovery. We also show that current models struggle to comprehend the nature of abstract intents, revealing important gaps in multimodal understanding and paving the way for future work.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="maharana-etal-2022-multimodal">
    <titleInfo>
      <title>Multimodal Intent Discovery from Livestream Videos</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Adyasha</namePart>
      <namePart type="family">Maharana</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Quan</namePart>
      <namePart type="family">Tran</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Franck</namePart>
      <namePart type="family">Dernoncourt</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Seunghyun</namePart>
      <namePart type="family">Yoon</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Trung</namePart>
      <namePart type="family">Bui</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Walter</namePart>
      <namePart type="family">Chang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mohit</namePart>
      <namePart type="family">Bansal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: NAACL 2022</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Marine</namePart>
        <namePart type="family">Carpuat</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marie-Catherine</namePart>
        <namePart type="family">de Marneffe</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ivan</namePart>
        <namePart type="given">Vladimir</namePart>
        <namePart type="family">Meza Ruiz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Seattle, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Individuals, educational institutions, and businesses are prolific at generating instructional video content such as “how-to” and tutorial guides. While significant progress has been made in basic video understanding tasks, identifying procedural intent within these instructional videos is a challenging and important task that remains unexplored but essential to video summarization, search, and recommendations. This paper introduces the problem of instructional intent identification and extraction from software instructional livestreams. We construct and present a new multimodal dataset consisting of software instructional livestreams and containing manual annotations for both detailed and abstract procedural intent that enable training and evaluation of joint video and text understanding models. We then introduce a multimodal cascaded cross-attention model to efficiently combine the weaker and noisier video signal with the more discriminative text signal. Our experiments show that our proposed model brings significant gains compared to strong baselines, including large-scale pretrained multimodal models. Our analysis further identifies that the task benefits from spatial as well as motion features extracted from videos, and provides insight on how the video signal is preferentially used for intent discovery. We also show that current models struggle to comprehend the nature of abstract intents, revealing important gaps in multimodal understanding and paving the way for future work.</abstract>
    <identifier type="citekey">maharana-etal-2022-multimodal</identifier>
    <identifier type="doi">10.18653/v1/2022.findings-naacl.36</identifier>
    <location>
      <url>https://aclanthology.org/2022.findings-naacl.36</url>
    </location>
    <part>
      <date>2022-07</date>
      <extent unit="page">
        <start>476</start>
        <end>489</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Multimodal Intent Discovery from Livestream Videos
%A Maharana, Adyasha
%A Tran, Quan
%A Dernoncourt, Franck
%A Yoon, Seunghyun
%A Bui, Trung
%A Chang, Walter
%A Bansal, Mohit
%Y Carpuat, Marine
%Y de Marneffe, Marie-Catherine
%Y Meza Ruiz, Ivan Vladimir
%S Findings of the Association for Computational Linguistics: NAACL 2022
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, United States
%F maharana-etal-2022-multimodal
%X Individuals, educational institutions, and businesses are prolific at generating instructional video content such as “how-to” and tutorial guides. While significant progress has been made in basic video understanding tasks, identifying procedural intent within these instructional videos is a challenging and important task that remains unexplored but essential to video summarization, search, and recommendations. This paper introduces the problem of instructional intent identification and extraction from software instructional livestreams. We construct and present a new multimodal dataset consisting of software instructional livestreams and containing manual annotations for both detailed and abstract procedural intent that enable training and evaluation of joint video and text understanding models. We then introduce a multimodal cascaded cross-attention model to efficiently combine the weaker and noisier video signal with the more discriminative text signal. Our experiments show that our proposed model brings significant gains compared to strong baselines, including large-scale pretrained multimodal models. Our analysis further identifies that the task benefits from spatial as well as motion features extracted from videos, and provides insight on how the video signal is preferentially used for intent discovery. We also show that current models struggle to comprehend the nature of abstract intents, revealing important gaps in multimodal understanding and paving the way for future work.
%R 10.18653/v1/2022.findings-naacl.36
%U https://aclanthology.org/2022.findings-naacl.36
%U https://doi.org/10.18653/v1/2022.findings-naacl.36
%P 476-489
Markdown (Informal)

[Multimodal Intent Discovery from Livestream Videos](https://aclanthology.org/2022.findings-naacl.36) (Maharana et al., Findings 2022)

ACL

Adyasha Maharana, Quan Tran, Franck Dernoncourt, Seunghyun Yoon, Trung Bui, Walter Chang, and Mohit Bansal. 2022. Multimodal Intent Discovery from Livestream Videos. In Findings of the Association for Computational Linguistics: NAACL 2022, pages 476–489, Seattle, United States. Association for Computational Linguistics.