@inproceedings{yu-etal-2022-dual,
title = "Dual-Encoder Transformers with Cross-modal Alignment for Multimodal Aspect-based Sentiment Analysis",
author = "Yu, Zhewen and
Wang, Jin and
Yu, Liang-Chih and
Zhang, Xuejie",
editor = "He, Yulan and
Ji, Heng and
Li, Sujian and
Liu, Yang and
Chang, Chua-Hui",
booktitle = "Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
month = nov,
year = "2022",
address = "Online only",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.aacl-main.32/",
doi = "10.18653/v1/2022.aacl-main.32",
pages = "414--423",
abstract = "Multimodal aspect-based sentiment analysis (MABSA) aims to extract the aspect terms from text and image pairs, and then analyze their corresponding sentiment. Recent studies typically use either a pipeline method or a unified transformer based on a cross-attention mechanism. However, these methods fail to explicitly and effectively incorporate the alignment between text and image. Supervised finetuning of the universal transformers for MABSA still requires a certain number of aligned image-text pairs. This study proposes a dual-encoder transformer with cross-modal alignment (DTCA). Two auxiliary tasks, including text-only extraction and text-patch alignment are introduced to enhance cross-attention performance. To align text and image, we propose an unsupervised approach which minimizes the Wasserstein distance between both modalities, forcing both encoders to produce more appropriate representations for the final extraction. Experimental results on two benchmarks demonstrate that DTCA consistently outperforms existing methods."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yu-etal-2022-dual">
<titleInfo>
<title>Dual-Encoder Transformers with Cross-modal Alignment for Multimodal Aspect-based Sentiment Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhewen</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jin</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liang-Chih</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuejie</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sujian</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chua-Hui</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online only</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Multimodal aspect-based sentiment analysis (MABSA) aims to extract the aspect terms from text and image pairs, and then analyze their corresponding sentiment. Recent studies typically use either a pipeline method or a unified transformer based on a cross-attention mechanism. However, these methods fail to explicitly and effectively incorporate the alignment between text and image. Supervised finetuning of the universal transformers for MABSA still requires a certain number of aligned image-text pairs. This study proposes a dual-encoder transformer with cross-modal alignment (DTCA). Two auxiliary tasks, including text-only extraction and text-patch alignment are introduced to enhance cross-attention performance. To align text and image, we propose an unsupervised approach which minimizes the Wasserstein distance between both modalities, forcing both encoders to produce more appropriate representations for the final extraction. Experimental results on two benchmarks demonstrate that DTCA consistently outperforms existing methods.</abstract>
<identifier type="citekey">yu-etal-2022-dual</identifier>
<identifier type="doi">10.18653/v1/2022.aacl-main.32</identifier>
<location>
<url>https://aclanthology.org/2022.aacl-main.32/</url>
</location>
<part>
<date>2022-11</date>
<extent unit="page">
<start>414</start>
<end>423</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Dual-Encoder Transformers with Cross-modal Alignment for Multimodal Aspect-based Sentiment Analysis
%A Yu, Zhewen
%A Wang, Jin
%A Yu, Liang-Chih
%A Zhang, Xuejie
%Y He, Yulan
%Y Ji, Heng
%Y Li, Sujian
%Y Liu, Yang
%Y Chang, Chua-Hui
%S Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)
%D 2022
%8 November
%I Association for Computational Linguistics
%C Online only
%F yu-etal-2022-dual
%X Multimodal aspect-based sentiment analysis (MABSA) aims to extract the aspect terms from text and image pairs, and then analyze their corresponding sentiment. Recent studies typically use either a pipeline method or a unified transformer based on a cross-attention mechanism. However, these methods fail to explicitly and effectively incorporate the alignment between text and image. Supervised finetuning of the universal transformers for MABSA still requires a certain number of aligned image-text pairs. This study proposes a dual-encoder transformer with cross-modal alignment (DTCA). Two auxiliary tasks, including text-only extraction and text-patch alignment are introduced to enhance cross-attention performance. To align text and image, we propose an unsupervised approach which minimizes the Wasserstein distance between both modalities, forcing both encoders to produce more appropriate representations for the final extraction. Experimental results on two benchmarks demonstrate that DTCA consistently outperforms existing methods.
%R 10.18653/v1/2022.aacl-main.32
%U https://aclanthology.org/2022.aacl-main.32/
%U https://doi.org/10.18653/v1/2022.aacl-main.32
%P 414-423
Markdown (Informal)
[Dual-Encoder Transformers with Cross-modal Alignment for Multimodal Aspect-based Sentiment Analysis](https://aclanthology.org/2022.aacl-main.32/) (Yu et al., AACL-IJCNLP 2022)
ACL