@inproceedings{zhao-etal-2022-visual,
title = "Visual Spatial Description: Controlled Spatial-Oriented Image-to-Text Generation",
author = "Zhao, Yu and
Wei, Jianguo and
Lin, ZhiChao and
Sun, Yueheng and
Zhang, Meishan and
Zhang, Min",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.93",
doi = "10.18653/v1/2022.emnlp-main.93",
pages = "1437--1449",
abstract = "Image-to-text tasks such as open-ended image captioning and controllable image description have received extensive attention for decades. Here we advance this line of work further, presenting Visual Spatial Description (VSD), a new perspective for image-to-text toward spatial semantics. Given an image and two objects inside it, VSD aims to produce one description focusing on the spatial perspective between the two objects. Accordingly, we annotate a dataset manually to facilitate the investigation of the newly-introduced task, and then build several benchmark encoder-decoder models by using VL-BART and VL-T5 as backbones. In addition, we investigate visual spatial relationship classification (VSRC) information into our model by pipeline and end-to-end architectures. Finally, we conduct experiments on our benchmark dataset to evaluate all our models. Results show that our models are awe-inspiring, offering accurate and human-like spatial-oriented text descriptions. Besides, VSRC has great potential for VSD, and the joint end-to-end architecture is the better choice for their integration. We will make the dataset and codes publicly available for research purposes.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhao-etal-2022-visual">
<titleInfo>
<title>Visual Spatial Description: Controlled Spatial-Oriented Image-to-Text Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianguo</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">ZhiChao</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yueheng</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Meishan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Image-to-text tasks such as open-ended image captioning and controllable image description have received extensive attention for decades. Here we advance this line of work further, presenting Visual Spatial Description (VSD), a new perspective for image-to-text generation oriented toward spatial semantics. Given an image and two objects inside it, VSD aims to produce a description focusing on the spatial relationship between the two objects. Accordingly, we manually annotate a dataset to facilitate the investigation of the newly introduced task, and then build several benchmark encoder-decoder models using VL-BART and VL-T5 as backbones. In addition, we investigate incorporating visual spatial relationship classification (VSRC) information into our models via pipeline and end-to-end architectures. Finally, we conduct experiments on our benchmark dataset to evaluate all our models. Results show that our models perform impressively, offering accurate and human-like spatial-oriented text descriptions. Moreover, VSRC has great potential for VSD, and the joint end-to-end architecture is the better choice for their integration. We will make the dataset and code publicly available for research purposes.</abstract>
<identifier type="citekey">zhao-etal-2022-visual</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-main.93</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-main.93</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>1437</start>
<end>1449</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Visual Spatial Description: Controlled Spatial-Oriented Image-to-Text Generation
%A Zhao, Yu
%A Wei, Jianguo
%A Lin, ZhiChao
%A Sun, Yueheng
%A Zhang, Meishan
%A Zhang, Min
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F zhao-etal-2022-visual
%X Image-to-text tasks such as open-ended image captioning and controllable image description have received extensive attention for decades. Here we advance this line of work further, presenting Visual Spatial Description (VSD), a new perspective for image-to-text generation oriented toward spatial semantics. Given an image and two objects inside it, VSD aims to produce a description focusing on the spatial relationship between the two objects. Accordingly, we manually annotate a dataset to facilitate the investigation of the newly introduced task, and then build several benchmark encoder-decoder models using VL-BART and VL-T5 as backbones. In addition, we investigate incorporating visual spatial relationship classification (VSRC) information into our models via pipeline and end-to-end architectures. Finally, we conduct experiments on our benchmark dataset to evaluate all our models. Results show that our models perform impressively, offering accurate and human-like spatial-oriented text descriptions. Moreover, VSRC has great potential for VSD, and the joint end-to-end architecture is the better choice for their integration. We will make the dataset and code publicly available for research purposes.
%R 10.18653/v1/2022.emnlp-main.93
%U https://aclanthology.org/2022.emnlp-main.93
%U https://doi.org/10.18653/v1/2022.emnlp-main.93
%P 1437-1449
Markdown (Informal)
[Visual Spatial Description: Controlled Spatial-Oriented Image-to-Text Generation](https://aclanthology.org/2022.emnlp-main.93) (Zhao et al., EMNLP 2022)
ACL
Yu Zhao, Jianguo Wei, ZhiChao Lin, Yueheng Sun, Meishan Zhang, and Min Zhang. 2022. Visual Spatial Description: Controlled Spatial-Oriented Image-to-Text Generation. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pages 1437–1449, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.