@inproceedings{guo-etal-2024-visual,
title = "Visual-Linguistic Dependency Encoding for Image-Text Retrieval",
author = "Guo, Wenxin and
Zhang, Lei and
Zhang, Kun and
Liu, Yi and
Mao, Zhendong",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1511/",
pages = "17384--17396",
abstract = "Image-text retrieval is a fundamental task to bridge the semantic gap between natural language and vision. Recent works primarily focus on aligning textual meanings with visual appearance. However, they often overlook the semantic discrepancy caused by syntactic structure in natural language expressions and relationships among visual entities. This oversight would lead to sub-optimal alignment and degraded retrieval performance, since the underlying semantic dependencies and object interactions remain inadequately encoded in both textual and visual embeddings. In this paper, we propose a novel Visual-Linguistic Dependency Encoding (VL-DE) framework, which explicitly models the dependency information among textual words and interaction patterns between image regions, improving the discriminative power of cross-modal representations for more accurate image-text retrieval. Specifically, VL-DE enhances textual representations by considering syntactic relationships and dependency types, and visual representations by attending to its spatially neighboring regions. A cross-attention mechanism is then introduced to aggregate aligned region-word pairs into image-text similarities. Analysis on Winoground, a dataset specially designed to measure vision-linguistic compositional structure reasoning, shows that VL-DE outperforms existing methods, demonstrating its effectiveness at this task. Comprehensive experiments on two benchmarks, Flickr30K and MS-COCO, further validate the competitiveness of our approach."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="guo-etal-2024-visual">
<titleInfo>
<title>Visual-Linguistic Dependency Encoding for Image-Text Retrieval</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wenxin</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lei</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhendong</namePart>
<namePart type="family">Mao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Image-text retrieval is a fundamental task to bridge the semantic gap between natural language and vision. Recent works primarily focus on aligning textual meanings with visual appearance. However, they often overlook the semantic discrepancy caused by syntactic structure in natural language expressions and relationships among visual entities. This oversight would lead to sub-optimal alignment and degraded retrieval performance, since the underlying semantic dependencies and object interactions remain inadequately encoded in both textual and visual embeddings. In this paper, we propose a novel Visual-Linguistic Dependency Encoding (VL-DE) framework, which explicitly models the dependency information among textual words and interaction patterns between image regions, improving the discriminative power of cross-modal representations for more accurate image-text retrieval. Specifically, VL-DE enhances textual representations by considering syntactic relationships and dependency types, and visual representations by attending to its spatially neighboring regions. A cross-attention mechanism is then introduced to aggregate aligned region-word pairs into image-text similarities. Analysis on Winoground, a dataset specially designed to measure vision-linguistic compositional structure reasoning, shows that VL-DE outperforms existing methods, demonstrating its effectiveness at this task. Comprehensive experiments on two benchmarks, Flickr30K and MS-COCO, further validate the competitiveness of our approach.</abstract>
<identifier type="citekey">guo-etal-2024-visual</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1511/</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>17384</start>
<end>17396</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Visual-Linguistic Dependency Encoding for Image-Text Retrieval
%A Guo, Wenxin
%A Zhang, Lei
%A Zhang, Kun
%A Liu, Yi
%A Mao, Zhendong
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F guo-etal-2024-visual
%X Image-text retrieval is a fundamental task to bridge the semantic gap between natural language and vision. Recent works primarily focus on aligning textual meanings with visual appearance. However, they often overlook the semantic discrepancy caused by syntactic structure in natural language expressions and relationships among visual entities. This oversight would lead to sub-optimal alignment and degraded retrieval performance, since the underlying semantic dependencies and object interactions remain inadequately encoded in both textual and visual embeddings. In this paper, we propose a novel Visual-Linguistic Dependency Encoding (VL-DE) framework, which explicitly models the dependency information among textual words and interaction patterns between image regions, improving the discriminative power of cross-modal representations for more accurate image-text retrieval. Specifically, VL-DE enhances textual representations by considering syntactic relationships and dependency types, and visual representations by attending to its spatially neighboring regions. A cross-attention mechanism is then introduced to aggregate aligned region-word pairs into image-text similarities. Analysis on Winoground, a dataset specially designed to measure vision-linguistic compositional structure reasoning, shows that VL-DE outperforms existing methods, demonstrating its effectiveness at this task. Comprehensive experiments on two benchmarks, Flickr30K and MS-COCO, further validate the competitiveness of our approach.
%U https://aclanthology.org/2024.lrec-main.1511/
%P 17384-17396
Markdown (Informal)
[Visual-Linguistic Dependency Encoding for Image-Text Retrieval](https://aclanthology.org/2024.lrec-main.1511/) (Guo et al., LREC-COLING 2024)
ACL
- Wenxin Guo, Lei Zhang, Kun Zhang, Yi Liu, and Zhendong Mao. 2024. Visual-Linguistic Dependency Encoding for Image-Text Retrieval. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 17384–17396, Torino, Italia. ELRA and ICCL.