BibTeX
@inproceedings{castro-etal-2023-scalable,
title = "Scalable Performance Analysis for Vision-Language Models",
author = "Castro, Santiago and
Ignat, Oana and
Mihalcea, Rada",
editor = "Palmer, Alexis and
Camacho-collados, Jose",
booktitle = "Proceedings of the 12th Joint Conference on Lexical and Computational Semantics (*SEM 2023)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.starsem-1.26/",
doi = "10.18653/v1/2023.starsem-1.26",
pages = "284--294",
abstract = "Joint vision-language models have shown great performance over a diverse set of tasks. However, little is known about their limitations, as the high dimensional space learned by these models makes it difficult to identify semantic errors. Recent work has addressed this problem by designing highly controlled probing task benchmarks. Our paper introduces a more scalable solution that relies on already annotated benchmarks. Our method consists of extracting a large set of diverse features from a vision-language benchmark and measuring their correlation with the output of the target model. We confirm previous findings that CLIP behaves like a bag of words model and performs better with nouns and verbs; we also uncover novel insights such as CLIP getting confused by concrete words. Our framework is available at \url{https://github.com/MichiganNLP/Scalable-VLM-Probing} and can be used with other multimodal models and benchmarks."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="castro-etal-2023-scalable">
    <titleInfo>
      <title>Scalable Performance Analysis for Vision-Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Santiago</namePart>
      <namePart type="family">Castro</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Oana</namePart>
      <namePart type="family">Ignat</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rada</namePart>
      <namePart type="family">Mihalcea</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 12th Joint Conference on Lexical and Computational Semantics (*SEM 2023)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Alexis</namePart>
        <namePart type="family">Palmer</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jose</namePart>
        <namePart type="family">Camacho-collados</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Toronto, Canada</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Joint vision-language models have shown great performance over a diverse set of tasks. However, little is known about their limitations, as the high dimensional space learned by these models makes it difficult to identify semantic errors. Recent work has addressed this problem by designing highly controlled probing task benchmarks. Our paper introduces a more scalable solution that relies on already annotated benchmarks. Our method consists of extracting a large set of diverse features from a vision-language benchmark and measuring their correlation with the output of the target model. We confirm previous findings that CLIP behaves like a bag of words model and performs better with nouns and verbs; we also uncover novel insights such as CLIP getting confused by concrete words. Our framework is available at https://github.com/MichiganNLP/Scalable-VLM-Probing and can be used with other multimodal models and benchmarks.</abstract>
    <identifier type="citekey">castro-etal-2023-scalable</identifier>
    <identifier type="doi">10.18653/v1/2023.starsem-1.26</identifier>
    <location>
      <url>https://aclanthology.org/2023.starsem-1.26/</url>
    </location>
    <part>
      <date>2023-07</date>
      <extent unit="page">
        <start>284</start>
        <end>294</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Scalable Performance Analysis for Vision-Language Models
%A Castro, Santiago
%A Ignat, Oana
%A Mihalcea, Rada
%Y Palmer, Alexis
%Y Camacho-collados, Jose
%S Proceedings of the 12th Joint Conference on Lexical and Computational Semantics (*SEM 2023)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F castro-etal-2023-scalable
%X Joint vision-language models have shown great performance over a diverse set of tasks. However, little is known about their limitations, as the high dimensional space learned by these models makes it difficult to identify semantic errors. Recent work has addressed this problem by designing highly controlled probing task benchmarks. Our paper introduces a more scalable solution that relies on already annotated benchmarks. Our method consists of extracting a large set of diverse features from a vision-language benchmark and measuring their correlation with the output of the target model. We confirm previous findings that CLIP behaves like a bag of words model and performs better with nouns and verbs; we also uncover novel insights such as CLIP getting confused by concrete words. Our framework is available at https://github.com/MichiganNLP/Scalable-VLM-Probing and can be used with other multimodal models and benchmarks.
%R 10.18653/v1/2023.starsem-1.26
%U https://aclanthology.org/2023.starsem-1.26/
%U https://doi.org/10.18653/v1/2023.starsem-1.26
%P 284-294
Markdown (Informal)
[Scalable Performance Analysis for Vision-Language Models](https://aclanthology.org/2023.starsem-1.26/) (Castro et al., *SEM 2023)
ACL
Santiago Castro, Oana Ignat, and Rada Mihalcea. 2023. Scalable Performance Analysis for Vision-Language Models. In Proceedings of the 12th Joint Conference on Lexical and Computational Semantics (*SEM 2023), pages 284–294, Toronto, Canada. Association for Computational Linguistics.
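
The method the abstract describes — extracting features from an already annotated benchmark and measuring their correlation with the target model's per-example output — can be illustrated with a short sketch. The snippet below is a hypothetical illustration only, not code from the MichiganNLP/Scalable-VLM-Probing repository: it assumes binary linguistic features and point-biserial correlation, and every function name, variable, and data value in it is a made-up stand-in.

# Minimal sketch of the correlation-based probing idea: correlate
# per-example features with a target model's per-example scores.
# Assumptions (not from the paper's repo): binary 0/1 features,
# point-biserial correlation via SciPy; all names here are hypothetical.
import numpy as np
from scipy.stats import pointbiserialr

def feature_score_correlations(features, scores):
    """features: (n_examples, n_features) 0/1 matrix, e.g. "caption
    contains a verb"; scores: (n_examples,) model outputs, e.g. CLIP
    image-text similarity. Returns one (correlation, p-value) per feature."""
    return [pointbiserialr(features[:, j], scores)
            for j in range(features.shape[1])]

# Toy usage with random data standing in for a real annotated benchmark.
rng = np.random.default_rng(0)
feats = rng.integers(0, 2, size=(100, 3))  # 3 hypothetical binary features
clip_scores = rng.normal(size=100)         # stand-in for per-example CLIP scores
for j, (r, p) in enumerate(feature_score_correlations(feats, clip_scores)):
    print(f"feature {j}: r={r:.3f}, p={p:.3f}")

Features with a strong positive or negative correlation to the model's scores would then flag the kinds of inputs the model handles well or poorly, which is the scalable alternative to hand-built probing benchmarks that the paper proposes.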