BibTeX
@inproceedings{hupert-etal-2022-describing,
title = "Describing Sets of Images with Textual-{PCA}",
author = "Hupert, Oded and
Schwartz, Idan and
Wolf, Lior",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-emnlp.279/",
doi = "10.18653/v1/2022.findings-emnlp.279",
pages = "3811--3821",
abstract = "We seek to semantically describe a set of images, capturing both the attributes of single images and the variations within the set. Our procedure is analogous to Principle Component Analysis, in which the role of projection vectors is replaced with generated phrases. First, a centroid phrase that has the largest average semantic similarity to the images in the set is generated, where both the computation of the similarity and the generation are based on pretrained vision-language models. Then, the phrase that generates the highest variation among the similarity scores is generated, using the same models. The next phrase maximizes the variance subject to being orthogonal, in the latent space, to the highest-variance phrase, and the process continues. Our experiments show that our method is able to convincingly capture the essence of image sets and describe the individual elements in a semantically meaningful way within the context of the entire set. Our code is available at: \url{https://github.com/OdedH/textual-pca}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hupert-etal-2022-describing">
<titleInfo>
<title>Describing Sets of Images with Textual-PCA</title>
</titleInfo>
<name type="personal">
<namePart type="given">Oded</namePart>
<namePart type="family">Hupert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Idan</namePart>
<namePart type="family">Schwartz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lior</namePart>
<namePart type="family">Wolf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We seek to semantically describe a set of images, capturing both the attributes of single images and the variations within the set. Our procedure is analogous to Principal Component Analysis, in which the role of projection vectors is replaced with generated phrases. First, a centroid phrase that has the largest average semantic similarity to the images in the set is generated, where both the computation of the similarity and the generation are based on pretrained vision-language models. Then, the phrase that generates the highest variation among the similarity scores is generated, using the same models. The next phrase maximizes the variance subject to being orthogonal, in the latent space, to the highest-variance phrase, and the process continues. Our experiments show that our method is able to convincingly capture the essence of image sets and describe the individual elements in a semantically meaningful way within the context of the entire set. Our code is available at: https://github.com/OdedH/textual-pca.</abstract>
<identifier type="citekey">hupert-etal-2022-describing</identifier>
<identifier type="doi">10.18653/v1/2022.findings-emnlp.279</identifier>
<location>
<url>https://aclanthology.org/2022.findings-emnlp.279/</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>3811</start>
<end>3821</end>
</extent>
</part>
</mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T Describing Sets of Images with Textual-PCA
%A Hupert, Oded
%A Schwartz, Idan
%A Wolf, Lior
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F hupert-etal-2022-describing
%X We seek to semantically describe a set of images, capturing both the attributes of single images and the variations within the set. Our procedure is analogous to Principal Component Analysis, in which the role of projection vectors is replaced with generated phrases. First, a centroid phrase that has the largest average semantic similarity to the images in the set is generated, where both the computation of the similarity and the generation are based on pretrained vision-language models. Then, the phrase that generates the highest variation among the similarity scores is generated, using the same models. The next phrase maximizes the variance subject to being orthogonal, in the latent space, to the highest-variance phrase, and the process continues. Our experiments show that our method is able to convincingly capture the essence of image sets and describe the individual elements in a semantically meaningful way within the context of the entire set. Our code is available at: https://github.com/OdedH/textual-pca.
%R 10.18653/v1/2022.findings-emnlp.279
%U https://aclanthology.org/2022.findings-emnlp.279/
%U https://doi.org/10.18653/v1/2022.findings-emnlp.279
%P 3811-3821

Markdown (Informal)
[Describing Sets of Images with Textual-PCA](https://aclanthology.org/2022.findings-emnlp.279/) (Hupert et al., Findings 2022)

ACL
Oded Hupert, Idan Schwartz, and Lior Wolf. 2022. Describing Sets of Images with Textual-PCA. In Findings of the Association for Computational Linguistics: EMNLP 2022, pages 3811–3821, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.
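
The abstract describes the procedure as a textual analogue of PCA: first a centroid phrase with maximal average similarity to the image set, then successive phrases that maximize the variance of the similarity scores subject to latent-space orthogonality to earlier phrases. As a rough illustration of that selection criterion only (not the paper's method, which generates phrases with pretrained vision-language models rather than selecting them), the sketch below picks phrases from a hypothetical pool of precomputed phrase embeddings; the function name, shapes, and candidate-pool setup are all assumptions, not part of the paper.

# Minimal numpy sketch of the selection analogue of Textual-PCA described
# in the abstract. Hypothetical setup: CLIP-style unit-normalized image and
# phrase embeddings are given; the paper instead *generates* phrases.
import numpy as np

def select_textual_pcs(image_emb, phrase_emb, n_components=3):
    """image_emb: (n_images, d) unit vectors; phrase_emb: (n_phrases, d) unit vectors.
    Returns candidate indices: [centroid, pc1, pc2, ...]."""
    sims = phrase_emb @ image_emb.T  # (n_phrases, n_images) similarity scores
    # Centroid phrase: largest *average* similarity to the image set.
    centroid = int(np.argmax(sims.mean(axis=1)))
    chosen, basis = [centroid], []
    for _ in range(n_components):
        # Enforce orthogonality in the latent space: remove the components
        # of each candidate along the already-chosen variance phrases.
        residual = phrase_emb.copy()
        for b in basis:
            residual -= np.outer(residual @ b, b)
        # Next phrase: maximal variance of similarity scores across the set.
        var = (residual @ image_emb.T).var(axis=1)
        var[chosen] = -np.inf  # never pick the same phrase twice
        k = int(np.argmax(var))
        chosen.append(k)
        basis.append(residual[k] / np.linalg.norm(residual[k]))
    return chosen

# Toy usage with random unit vectors standing in for real embeddings.
rng = np.random.default_rng(0)
imgs = rng.normal(size=(8, 64)); imgs /= np.linalg.norm(imgs, axis=1, keepdims=True)
phrases = rng.normal(size=(50, 64)); phrases /= np.linalg.norm(phrases, axis=1, keepdims=True)
print(select_textual_pcs(imgs, phrases))

Note that, as in ordinary PCA, each successive direction here is constrained to the orthogonal complement of the previous ones; the paper's contribution is replacing those abstract directions with human-readable generated phrases.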