@inproceedings{nag-chowdhury-etal-2021-exploiting,
title = "Exploiting Image{--}Text Synergy for Contextual Image Captioning",
author = "Nag Chowdhury, Sreyasi and
Bhowmik, Rajarshi and
Ravi, Hareesh and
de Melo, Gerard and
Razniewski, Simon and
Weikum, Gerhard",
editor = "Mosbach, Marius and
Hedderich, Michael A. and
Pezzelle, Sandro and
Mogadala, Aditya and
Klakow, Dietrich and
Moens, Marie-Francine and
Akata, Zeynep",
booktitle = "Proceedings of the Third Workshop on Beyond Vision and LANguage: inTEgrating Real-world kNowledge (LANTERN)",
month = apr,
year = "2021",
address = "Kyiv, Ukraine",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.lantern-1.3",
pages = "30--37",
abstract = "Modern web content - news articles, blog posts, educational resources, marketing brochures - is predominantly multimodal. A notable trait is the inclusion of media such as images placed at meaningful locations within a textual narrative. Most often, such images are accompanied by captions - either factual or stylistic (humorous, metaphorical, etc.) - making the narrative more engaging to the reader. While standalone image captioning has been extensively studied, captioning an image based on external knowledge such as its surrounding text remains under-explored. In this paper, we study this new task: given an image and an associated unstructured knowledge snippet, the goal is to generate a contextual caption for the image.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nag-chowdhury-etal-2021-exploiting">
<titleInfo>
<title>Exploiting Image–Text Synergy for Contextual Image Captioning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sreyasi</namePart>
<namePart type="family">Nag Chowdhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rajarshi</namePart>
<namePart type="family">Bhowmik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hareesh</namePart>
<namePart type="family">Ravi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gerard</namePart>
<namePart type="family">de Melo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Razniewski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gerhard</namePart>
<namePart type="family">Weikum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Beyond Vision and LANguage: inTEgrating Real-world kNowledge (LANTERN)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marius</namePart>
<namePart type="family">Mosbach</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="given">A.</namePart>
<namePart type="family">Hedderich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sandro</namePart>
<namePart type="family">Pezzelle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aditya</namePart>
<namePart type="family">Mogadala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dietrich</namePart>
<namePart type="family">Klakow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeynep</namePart>
<namePart type="family">Akata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Kyiv, Ukraine</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Modern web content - news articles, blog posts, educational resources, marketing brochures - is predominantly multimodal. A notable trait is the inclusion of media such as images placed at meaningful locations within a textual narrative. Most often, such images are accompanied by captions - either factual or stylistic (humorous, metaphorical, etc.) - making the narrative more engaging to the reader. While standalone image captioning has been extensively studied, captioning an image based on external knowledge such as its surrounding text remains under-explored. In this paper, we study this new task: given an image and an associated unstructured knowledge snippet, the goal is to generate a contextual caption for the image.</abstract>
<identifier type="citekey">nag-chowdhury-etal-2021-exploiting</identifier>
<location>
<url>https://aclanthology.org/2021.lantern-1.3</url>
</location>
<part>
<date>2021-04</date>
<extent unit="page">
<start>30</start>
<end>37</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Exploiting Image–Text Synergy for Contextual Image Captioning
%A Nag Chowdhury, Sreyasi
%A Bhowmik, Rajarshi
%A Ravi, Hareesh
%A de Melo, Gerard
%A Razniewski, Simon
%A Weikum, Gerhard
%Y Mosbach, Marius
%Y Hedderich, Michael A.
%Y Pezzelle, Sandro
%Y Mogadala, Aditya
%Y Klakow, Dietrich
%Y Moens, Marie-Francine
%Y Akata, Zeynep
%S Proceedings of the Third Workshop on Beyond Vision and LANguage: inTEgrating Real-world kNowledge (LANTERN)
%D 2021
%8 April
%I Association for Computational Linguistics
%C Kyiv, Ukraine
%F nag-chowdhury-etal-2021-exploiting
%X Modern web content - news articles, blog posts, educational resources, marketing brochures - is predominantly multimodal. A notable trait is the inclusion of media such as images placed at meaningful locations within a textual narrative. Most often, such images are accompanied by captions - either factual or stylistic (humorous, metaphorical, etc.) - making the narrative more engaging to the reader. While standalone image captioning has been extensively studied, captioning an image based on external knowledge such as its surrounding text remains under-explored. In this paper, we study this new task: given an image and an associated unstructured knowledge snippet, the goal is to generate a contextual caption for the image.
%U https://aclanthology.org/2021.lantern-1.3
%P 30-37
Markdown (Informal)
[Exploiting Image–Text Synergy for Contextual Image Captioning](https://aclanthology.org/2021.lantern-1.3) (Nag Chowdhury et al., LANTERN 2021)
ACL
- Sreyasi Nag Chowdhury, Rajarshi Bhowmik, Hareesh Ravi, Gerard de Melo, Simon Razniewski, and Gerhard Weikum. 2021. Exploiting Image–Text Synergy for Contextual Image Captioning. In Proceedings of the Third Workshop on Beyond Vision and LANguage: inTEgrating Real-world kNowledge (LANTERN), pages 30–37, Kyiv, Ukraine. Association for Computational Linguistics.