@inproceedings{sahay-etal-2020-low,
title = "Low Rank Fusion based Transformers for Multimodal Sequences",
author = "Sahay, Saurav and
Okur, Eda and
H Kumar, Shachi and
Nachman, Lama",
editor = "Zadeh, Amir and
Morency, Louis-Philippe and
Liang, Paul Pu and
Poria, Soujanya",
booktitle = "Second Grand-Challenge and Workshop on Multimodal Language (Challenge-HML)",
month = jul,
year = "2020",
address = "Seattle, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.challengehml-1.4/",
doi = "10.18653/v1/2020.challengehml-1.4",
pages = "29--34",
abstract = "Our senses individually work in a coordinated fashion to express our emotional intentions. In this work, we experiment with modeling modality-specific sensory signals to attend to our latent multimodal emotional intentions and vice versa expressed via low-rank multimodal fusion and multimodal transformers. The low-rank factorization of multimodal fusion amongst the modalities helps represent approximate multiplicative latent signal interactions. Motivated by the work of (CITATION) and (CITATION), we present our transformer-based cross-fusion architecture without any over-parameterization of the model. The low-rank fusion helps represent the latent signal interactions while the modality-specific attention helps focus on relevant parts of the signal. We present two methods for the Multimodal Sentiment and Emotion Recognition results on CMU-MOSEI, CMU-MOSI, and IEMOCAP datasets and show that our models have lesser parameters, train faster and perform comparably to many larger fusion-based architectures."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sahay-etal-2020-low">
<titleInfo>
<title>Low Rank Fusion based Transformers for Multimodal Sequences</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saurav</namePart>
<namePart type="family">Sahay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eda</namePart>
<namePart type="family">Okur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shachi</namePart>
<namePart type="family">H Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lama</namePart>
<namePart type="family">Nachman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Second Grand-Challenge and Workshop on Multimodal Language (Challenge-HML)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="family">Zadeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Louis-Philippe</namePart>
<namePart type="family">Morency</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="given">Pu</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Soujanya</namePart>
<namePart type="family">Poria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Seattle, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Our senses individually work in a coordinated fashion to express our emotional intentions. In this work, we experiment with modeling modality-specific sensory signals to attend to our latent multimodal emotional intentions and vice versa, expressed via low-rank multimodal fusion and multimodal transformers. The low-rank factorization of multimodal fusion amongst the modalities helps represent approximate multiplicative latent signal interactions. Motivated by the work of (CITATION) and (CITATION), we present our transformer-based cross-fusion architecture without any over-parameterization of the model. The low-rank fusion helps represent the latent signal interactions while the modality-specific attention helps focus on relevant parts of the signal. We present two methods for the Multimodal Sentiment and Emotion Recognition results on the CMU-MOSEI, CMU-MOSI, and IEMOCAP datasets and show that our models have fewer parameters, train faster, and perform comparably to many larger fusion-based architectures.</abstract>
<identifier type="citekey">sahay-etal-2020-low</identifier>
<identifier type="doi">10.18653/v1/2020.challengehml-1.4</identifier>
<location>
<url>https://aclanthology.org/2020.challengehml-1.4/</url>
</location>
<part>
<date>2020-07</date>
<extent unit="page">
<start>29</start>
<end>34</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Low Rank Fusion based Transformers for Multimodal Sequences
%A Sahay, Saurav
%A Okur, Eda
%A H Kumar, Shachi
%A Nachman, Lama
%Y Zadeh, Amir
%Y Morency, Louis-Philippe
%Y Liang, Paul Pu
%Y Poria, Soujanya
%S Second Grand-Challenge and Workshop on Multimodal Language (Challenge-HML)
%D 2020
%8 July
%I Association for Computational Linguistics
%C Seattle, USA
%F sahay-etal-2020-low
%X Our senses individually work in a coordinated fashion to express our emotional intentions. In this work, we experiment with modeling modality-specific sensory signals to attend to our latent multimodal emotional intentions and vice versa, expressed via low-rank multimodal fusion and multimodal transformers. The low-rank factorization of multimodal fusion amongst the modalities helps represent approximate multiplicative latent signal interactions. Motivated by the work of (CITATION) and (CITATION), we present our transformer-based cross-fusion architecture without any over-parameterization of the model. The low-rank fusion helps represent the latent signal interactions while the modality-specific attention helps focus on relevant parts of the signal. We present two methods for the Multimodal Sentiment and Emotion Recognition results on the CMU-MOSEI, CMU-MOSI, and IEMOCAP datasets and show that our models have fewer parameters, train faster, and perform comparably to many larger fusion-based architectures.
%R 10.18653/v1/2020.challengehml-1.4
%U https://aclanthology.org/2020.challengehml-1.4/
%U https://doi.org/10.18653/v1/2020.challengehml-1.4
%P 29-34
Markdown (Informal)
[Low Rank Fusion based Transformers for Multimodal Sequences](https://aclanthology.org/2020.challengehml-1.4/) (Sahay et al., Challenge-HML 2020)
ACL
Saurav Sahay, Eda Okur, Shachi H Kumar, and Lama Nachman. 2020. [Low Rank Fusion based Transformers for Multimodal Sequences](https://aclanthology.org/2020.challengehml-1.4/). In *Second Grand-Challenge and Workshop on Multimodal Language (Challenge-HML)*, pages 29–34, Seattle, USA. Association for Computational Linguistics.
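
The abstract above repeatedly points to low-rank factorization of multimodal fusion as the way approximate multiplicative interactions between modality signals are represented. As a minimal sketch of that general idea under stated assumptions (this is not the paper's exact architecture; the class name `LowRankFusion`, the feature sizes, and `rank=4` below are purely illustrative), a rank-factorized fusion layer can be written in PyTorch as:

```python
# Minimal sketch of rank-r factorized multimodal fusion, in the spirit of the
# low-rank fusion the abstract describes. NOT the authors' exact model; names,
# dimensions, and the rank are illustrative assumptions.
import torch
import torch.nn as nn


class LowRankFusion(nn.Module):
    """Fuse per-modality vectors without materializing their full outer product.

    Each modality m gets `rank` projection matrices of shape (out_dim, d_m + 1);
    the fused vector is the sum over ranks of the elementwise product of the
    per-modality projections, approximating a multiplicative tensor interaction.
    """

    def __init__(self, dims, out_dim, rank=4):
        super().__init__()
        self.factors = nn.ParameterList(
            [nn.Parameter(0.01 * torch.randn(rank, out_dim, d + 1)) for d in dims]
        )

    def forward(self, inputs):
        # inputs: list of (batch, d_m) tensors, one per modality.
        fused = None
        for x, factor in zip(inputs, self.factors):
            ones = torch.ones(x.size(0), 1, device=x.device, dtype=x.dtype)
            z = torch.cat([x, ones], dim=-1)               # (batch, d_m + 1); the appended 1 acts as a bias slot
            proj = torch.einsum("rod,bd->bro", factor, z)  # (batch, rank, out_dim)
            fused = proj if fused is None else fused * proj  # elementwise product across modalities
        return fused.sum(dim=1)                            # sum over ranks -> (batch, out_dim)


# Toy usage with assumed feature sizes for text, audio, and vision streams.
fusion = LowRankFusion(dims=[300, 74, 35], out_dim=128, rank=4)
t, a, v = torch.randn(8, 300), torch.randn(8, 74), torch.randn(8, 35)
h = fusion([t, a, v])  # -> shape (8, 128)
```

Summing rank-wise elementwise products avoids ever materializing the full (d_text+1) x (d_audio+1) x (d_vision+1) outer-product tensor, which is the kind of parameter saving the abstract alludes to when it reports smaller, faster-training models.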