@inproceedings{sui-etal-2024-melov,
title = "{MELOV}: Multimodal Entity Linking with Optimized Visual Features in Latent Space",
author = "Sui, Xuhui and
Zhang, Ying and
Zhao, Yu and
Song, Kehui and
Zhou, Baohang and
Yuan, Xiaojie",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.46/",
doi = "10.18653/v1/2024.findings-acl.46",
pages = "816--826",
abstract = "Multimodal entity linking (MEL), which aligns ambiguous mentions within multimodal contexts to referent entities from multimodal knowledge bases, is essential for many natural language processing applications. Previous MEL methods mainly focus on exploring complex multimodal interaction mechanisms to better capture coherence evidence between mentions and entities by mining complementary information. However, in real-world social media scenarios, vision modality often exhibits low quality, low value, or low relevance to the mention. Integrating such information directly will backfire, leading to a weakened consistency between mentions and their corresponding entities. In this paper, we propose a novel latent space vision feature optimization framework MELOV, which combines inter-modality and intra-modality optimizations to address these challenges. For the inter-modality optimization, we exploit the variational autoencoder to mine shared information and generate text-based visual features. For the intra-modality optimization, we consider the relationships between mentions and build graph convolutional network to aggregate the visual features of semantic similar neighbors. Extensive experiments on three benchmark datasets demonstrate the superiority of our proposed framework."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sui-etal-2024-melov">
<titleInfo>
<title>MELOV: Multimodal Entity Linking with Optimized Visual Features in Latent Space</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xuhui</namePart>
<namePart type="family">Sui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ying</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kehui</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Baohang</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaojie</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Multimodal entity linking (MEL), which aligns ambiguous mentions within multimodal contexts to referent entities from multimodal knowledge bases, is essential for many natural language processing applications. Previous MEL methods mainly focus on exploring complex multimodal interaction mechanisms to better capture coherence evidence between mentions and entities by mining complementary information. However, in real-world social media scenarios, vision modality often exhibits low quality, low value, or low relevance to the mention. Integrating such information directly will backfire, leading to a weakened consistency between mentions and their corresponding entities. In this paper, we propose a novel latent space vision feature optimization framework MELOV, which combines inter-modality and intra-modality optimizations to address these challenges. For the inter-modality optimization, we exploit the variational autoencoder to mine shared information and generate text-based visual features. For the intra-modality optimization, we consider the relationships between mentions and build graph convolutional network to aggregate the visual features of semantic similar neighbors. Extensive experiments on three benchmark datasets demonstrate the superiority of our proposed framework.</abstract>
<identifier type="citekey">sui-etal-2024-melov</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.46</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.46/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>816</start>
<end>826</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MELOV: Multimodal Entity Linking with Optimized Visual Features in Latent Space
%A Sui, Xuhui
%A Zhang, Ying
%A Zhao, Yu
%A Song, Kehui
%A Zhou, Baohang
%A Yuan, Xiaojie
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F sui-etal-2024-melov
%X Multimodal entity linking (MEL), which aligns ambiguous mentions within multimodal contexts to referent entities from multimodal knowledge bases, is essential for many natural language processing applications. Previous MEL methods mainly focus on exploring complex multimodal interaction mechanisms to better capture coherence evidence between mentions and entities by mining complementary information. However, in real-world social media scenarios, vision modality often exhibits low quality, low value, or low relevance to the mention. Integrating such information directly will backfire, leading to a weakened consistency between mentions and their corresponding entities. In this paper, we propose a novel latent space vision feature optimization framework MELOV, which combines inter-modality and intra-modality optimizations to address these challenges. For the inter-modality optimization, we exploit the variational autoencoder to mine shared information and generate text-based visual features. For the intra-modality optimization, we consider the relationships between mentions and build graph convolutional network to aggregate the visual features of semantic similar neighbors. Extensive experiments on three benchmark datasets demonstrate the superiority of our proposed framework.
%R 10.18653/v1/2024.findings-acl.46
%U https://aclanthology.org/2024.findings-acl.46/
%U https://doi.org/10.18653/v1/2024.findings-acl.46
%P 816-826
Markdown (Informal)
[MELOV: Multimodal Entity Linking with Optimized Visual Features in Latent Space](https://aclanthology.org/2024.findings-acl.46/) (Sui et al., Findings 2024)
ACL