@inproceedings{gao-etal-2024-high,
title = "High-Order Semantic Alignment for Unsupervised Fine-Grained Image-Text Retrieval",
author = "Gao, Rui and
Cheng, Miaomiao and
Han, Xu and
Song, Wei",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.714",
pages = "8155--8165",
abstract = "Cross-modal retrieval is an important yet challenging task due to the semantic discrepancy between visual content and language. To measure the correlation between images and text, most existing research mainly focuses on learning global or local correspondence, failing to explore fine-grained local-global alignment. To infer more accurate similarity scores, we introduce a novel High Order Semantic Alignment (HOSA) model that can provide complementary and comprehensive semantic clues. Specifically, to jointly learn global and local alignment and emphasize local-global interaction, we employ tensor-product (t-product) operation to reconstruct one modal{'}s representation based on another modal{'}s information in a common semantic space. Such a cross-modal reconstruction strategy would significantly enhance inter-modal correlation learning in a fine-grained manner. Extensive experiments on two benchmark datasets validate that our model significantly outperforms several state-of-the-art baselines, especially in retrieving the most relevant results.",
}
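The abstract names the tensor-product (t-product) operation as the core of HOSA's cross-modal reconstruction. For reference, below is a minimal NumPy sketch of the standard t-product of Kilmer and Martin: circular convolution along the tube fibers, computed as frontal-slice matrix multiplications in the Fourier domain along the third axis. This only illustrates the generic operator named in the abstract; the `t_product` helper, tensor shapes, and the toy usage are illustrative assumptions, not the paper's actual reconstruction pipeline.

```python
import numpy as np

def t_product(A, B):
    """t-product (Kilmer & Martin) of two third-order tensors.

    A: (n1, n2, n3), B: (n2, n4, n3) -> C: (n1, n4, n3).
    Equivalent to circular convolution of the tube fibers, computed
    as per-slice matrix products in the Fourier domain.
    """
    assert A.shape[1] == B.shape[0] and A.shape[2] == B.shape[2]
    Af = np.fft.fft(A, axis=2)               # FFT along the third axis
    Bf = np.fft.fft(B, axis=2)
    Cf = np.einsum('ijk,jlk->ilk', Af, Bf)   # matmul per frontal slice
    return np.real(np.fft.ifft(Cf, axis=2))  # back to the spatial domain

# Toy usage: reconstruct one 3-D representation from another tensor.
A = np.random.randn(4, 5, 6)   # e.g. one modality's representation (hypothetical shapes)
B = np.random.randn(5, 3, 6)   # e.g. a learned reconstruction tensor
C = t_product(A, B)            # shape (4, 3, 6)
```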
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gao-etal-2024-high">
<titleInfo>
<title>High-Order Semantic Alignment for Unsupervised Fine-Grained Image-Text Retrieval</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miaomiao</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xu</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Cross-modal retrieval is an important yet challenging task due to the semantic discrepancy between visual content and language. To measure the correlation between images and text, most existing research mainly focuses on learning global or local correspondence, failing to explore fine-grained local-global alignment. To infer more accurate similarity scores, we introduce a novel High Order Semantic Alignment (HOSA) model that can provide complementary and comprehensive semantic clues. Specifically, to jointly learn global and local alignment and emphasize local-global interaction, we employ the tensor-product (t-product) operation to reconstruct one modality’s representation based on another modality’s information in a common semantic space. Such a cross-modal reconstruction strategy significantly enhances inter-modal correlation learning in a fine-grained manner. Extensive experiments on two benchmark datasets validate that our model significantly outperforms several state-of-the-art baselines, especially in retrieving the most relevant results.</abstract>
<identifier type="citekey">gao-etal-2024-high</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.714</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>8155</start>
<end>8165</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T High-Order Semantic Alignment for Unsupervised Fine-Grained Image-Text Retrieval
%A Gao, Rui
%A Cheng, Miaomiao
%A Han, Xu
%A Song, Wei
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F gao-etal-2024-high
%X Cross-modal retrieval is an important yet challenging task due to the semantic discrepancy between visual content and language. To measure the correlation between images and text, most existing research mainly focuses on learning global or local correspondence, failing to explore fine-grained local-global alignment. To infer more accurate similarity scores, we introduce a novel High Order Semantic Alignment (HOSA) model that can provide complementary and comprehensive semantic clues. Specifically, to jointly learn global and local alignment and emphasize local-global interaction, we employ the tensor-product (t-product) operation to reconstruct one modality’s representation based on another modality’s information in a common semantic space. Such a cross-modal reconstruction strategy significantly enhances inter-modal correlation learning in a fine-grained manner. Extensive experiments on two benchmark datasets validate that our model significantly outperforms several state-of-the-art baselines, especially in retrieving the most relevant results.
%U https://aclanthology.org/2024.lrec-main.714
%P 8155-8165
Markdown (Informal)
[High-Order Semantic Alignment for Unsupervised Fine-Grained Image-Text Retrieval](https://aclanthology.org/2024.lrec-main.714) (Gao et al., LREC-COLING 2024)
ACL
Rui Gao, Miaomiao Cheng, Xu Han, and Wei Song. 2024. High-Order Semantic Alignment for Unsupervised Fine-Grained Image-Text Retrieval. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 8155–8165, Torino, Italia. ELRA and ICCL.