@inproceedings{donmez-etal-2023-hnc,
title = "{HNC}: Leveraging Hard Negative Captions towards Models with Fine-Grained Visual-Linguistic Comprehension Capabilities",
author = {D{\"o}nmez, Esra and
Tilli, Pascal and
Yang, Hsiu-Yu and
Vu, Ngoc Thang and
Silberer, Carina},
editor = "Jiang, Jing and
Reitter, David and
Deng, Shumin",
booktitle = "Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL)",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.conll-1.24",
doi = "10.18653/v1/2023.conll-1.24",
pages = "364--388",
abstract = "Image-Text-Matching (ITM) is one of the defacto methods of learning generalized representations from a large corpus in Vision and Language (VL). However, due to the weak association between the web-collected image{--}text pairs, models fail to show fine-grained understanding of the combined semantics of these modalities. To this end, we propose Hard Negative Captions (HNC): an automatically created dataset containing foiled hard negative captions for ITM training towards achieving fine-grained cross-modal comprehension in VL. Additionally, we provide a challenging manually-created test set for benchmarking models on a fine-grained cross-modal mismatch with varying levels of compositional complexity. Our results show the effectiveness of training on HNC by improving the models{'} zero-shot capabilities in detecting mismatches on diagnostic tasks and performing robustly under noisy visual input scenarios. Also, we demonstrate that HNC models yield a comparable or better initialization for fine-tuning. Our code and data are publicly available.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="donmez-etal-2023-hnc">
<titleInfo>
<title>HNC: Leveraging Hard Negative Captions towards Models with Fine-Grained Visual-Linguistic Comprehension Capabilities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Esra</namePart>
<namePart type="family">Dönmez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pascal</namePart>
<namePart type="family">Tilli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hsiu-Yu</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ngoc</namePart>
<namePart type="given">Thang</namePart>
<namePart type="family">Vu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carina</namePart>
<namePart type="family">Silberer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Reitter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shumin</namePart>
<namePart type="family">Deng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Image-Text-Matching (ITM) is one of the defacto methods of learning generalized representations from a large corpus in Vision and Language (VL). However, due to the weak association between the web-collected image–text pairs, models fail to show fine-grained understanding of the combined semantics of these modalities. To this end, we propose Hard Negative Captions (HNC): an automatically created dataset containing foiled hard negative captions for ITM training towards achieving fine-grained cross-modal comprehension in VL. Additionally, we provide a challenging manually-created test set for benchmarking models on a fine-grained cross-modal mismatch with varying levels of compositional complexity. Our results show the effectiveness of training on HNC by improving the models’ zero-shot capabilities in detecting mismatches on diagnostic tasks and performing robustly under noisy visual input scenarios. Also, we demonstrate that HNC models yield a comparable or better initialization for fine-tuning. Our code and data are publicly available.</abstract>
<identifier type="citekey">donmez-etal-2023-hnc</identifier>
<identifier type="doi">10.18653/v1/2023.conll-1.24</identifier>
<location>
<url>https://aclanthology.org/2023.conll-1.24</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>364</start>
<end>388</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T HNC: Leveraging Hard Negative Captions towards Models with Fine-Grained Visual-Linguistic Comprehension Capabilities
%A Dönmez, Esra
%A Tilli, Pascal
%A Yang, Hsiu-Yu
%A Vu, Ngoc Thang
%A Silberer, Carina
%Y Jiang, Jing
%Y Reitter, David
%Y Deng, Shumin
%S Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL)
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F donmez-etal-2023-hnc
%X Image-Text-Matching (ITM) is one of the defacto methods of learning generalized representations from a large corpus in Vision and Language (VL). However, due to the weak association between the web-collected image–text pairs, models fail to show fine-grained understanding of the combined semantics of these modalities. To this end, we propose Hard Negative Captions (HNC): an automatically created dataset containing foiled hard negative captions for ITM training towards achieving fine-grained cross-modal comprehension in VL. Additionally, we provide a challenging manually-created test set for benchmarking models on a fine-grained cross-modal mismatch with varying levels of compositional complexity. Our results show the effectiveness of training on HNC by improving the models’ zero-shot capabilities in detecting mismatches on diagnostic tasks and performing robustly under noisy visual input scenarios. Also, we demonstrate that HNC models yield a comparable or better initialization for fine-tuning. Our code and data are publicly available.
%R 10.18653/v1/2023.conll-1.24
%U https://aclanthology.org/2023.conll-1.24
%U https://doi.org/10.18653/v1/2023.conll-1.24
%P 364-388
Markdown (Informal)
[HNC: Leveraging Hard Negative Captions towards Models with Fine-Grained Visual-Linguistic Comprehension Capabilities](https://aclanthology.org/2023.conll-1.24) (Dönmez et al., CoNLL 2023)
ACL