@inproceedings{maeda-etal-2023-query,
title = "Query-based Image Captioning from Multi-context 360$cdegree$ Images",
author = "Maeda, Koki and
Kurita, Shuhei and
Miyanishi, Taiki and
Okazaki, Naoaki",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.463/",
doi = "10.18653/v1/2023.findings-emnlp.463",
pages = "6940--6954",
abstract = "A 360-degree image captures the entire scene without the limitations of a camera`s field of view, which makes it difficult to describe all the contexts in a single caption. We propose a novel task called Query-based Image Captioning (QuIC) for 360-degree images, where a query (words or short phrases) specifies the context to describe. This task is more challenging than the conventional image captioning task, which describes salient objects in images, as it requires fine-grained scene understanding to select the contents consistent with user`s intent based on the query. We construct a dataset for the new task that comprises 3,940 360-degree images and 18,459 pairs of queries and captions annotated manually. Experiments demonstrate that fine-tuning image captioning models further on our dataset can generate more diverse and controllable captions from multiple contexts of 360-degree images."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="maeda-etal-2023-query">
<titleInfo>
<title>Query-based Image Captioning from Multi-context 360° Images</title>
</titleInfo>
<name type="personal">
<namePart type="given">Koki</namePart>
<namePart type="family">Maeda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuhei</namePart>
<namePart type="family">Kurita</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Taiki</namePart>
<namePart type="family">Miyanishi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>A 360-degree image captures the entire scene without the limitations of a camera's field of view, which makes it difficult to describe all the contexts in a single caption. We propose a novel task called Query-based Image Captioning (QuIC) for 360-degree images, where a query (words or short phrases) specifies the context to describe. This task is more challenging than the conventional image captioning task, which describes salient objects in images, as it requires fine-grained scene understanding to select the contents consistent with user's intent based on the query. We construct a dataset for the new task that comprises 3,940 360-degree images and 18,459 pairs of queries and captions annotated manually. Experiments demonstrate that fine-tuning image captioning models further on our dataset can generate more diverse and controllable captions from multiple contexts of 360-degree images.</abstract>
<identifier type="citekey">maeda-etal-2023-query</identifier>
<identifier type="doi">10.18653/v1/2023.findings-emnlp.463</identifier>
<location>
<url>https://aclanthology.org/2023.findings-emnlp.463/</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>6940</start>
<end>6954</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Query-based Image Captioning from Multi-context 360° Images
%A Maeda, Koki
%A Kurita, Shuhei
%A Miyanishi, Taiki
%A Okazaki, Naoaki
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F maeda-etal-2023-query
%X A 360-degree image captures the entire scene without the limitations of a camera's field of view, which makes it difficult to describe all the contexts in a single caption. We propose a novel task called Query-based Image Captioning (QuIC) for 360-degree images, where a query (words or short phrases) specifies the context to describe. This task is more challenging than the conventional image captioning task, which describes salient objects in images, as it requires fine-grained scene understanding to select the contents consistent with user's intent based on the query. We construct a dataset for the new task that comprises 3,940 360-degree images and 18,459 pairs of queries and captions annotated manually. Experiments demonstrate that fine-tuning image captioning models further on our dataset can generate more diverse and controllable captions from multiple contexts of 360-degree images.
%R 10.18653/v1/2023.findings-emnlp.463
%U https://aclanthology.org/2023.findings-emnlp.463/
%U https://doi.org/10.18653/v1/2023.findings-emnlp.463
%P 6940-6954
Markdown (Informal)
[Query-based Image Captioning from Multi-context 360° Images](https://aclanthology.org/2023.findings-emnlp.463/) (Maeda et al., Findings 2023)
ACL
Koki Maeda, Shuhei Kurita, Taiki Miyanishi, and Naoaki Okazaki. 2023. Query-based Image Captioning from Multi-context 360° Images. In Findings of the Association for Computational Linguistics: EMNLP 2023, pages 6940–6954, Singapore. Association for Computational Linguistics.