@inproceedings{jiang-etal-2020-visual,
title = "Visual-Textual Alignment for Graph Inference in Visual Dialog",
author = "Jiang, Tianling and
Ji, Yi and
Liu, Chunping and
Shao, Hailin",
editor = "Scott, Donia and
Bel, Nuria and
Zong, Chengqing",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.170",
doi = "10.18653/v1/2020.coling-main.170",
pages = "1874--1885",
abstract = "As a conversational intelligence task, visual dialog entails answering a series of questions grounded in an image, using the dialog history as context. To generate correct answers, the comprehension of the semantic dependencies among implicit visual and textual contents is critical. Prior works usually ignored the underlying relation and failed to infer it reasonably. In this paper, we propose a Visual-Textual Alignment for Graph Inference (VTAGI) network. Compared with other approaches, it makes up the lack of structural inference in visual dialog. The whole system consists of two modules, Visual and Textual Alignment (VTA) and Visual Graph Attended by Text (VGAT). Specially, the VTA module aims at representing an image with a set of integrated visual regions and corresponding textual concepts, reflecting certain semantics. The VGAT module views the visual features with semantic information as observed nodes and each node learns the relationship with others in visual graph. We also qualitatively and quantitatively evaluate the model on VisDial v1.0 dataset, showing our VTAGI outperforms previous state-of-the-art models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jiang-etal-2020-visual">
<titleInfo>
<title>Visual-Textual Alignment for Graph Inference in Visual Dialog</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tianling</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chunping</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hailin</namePart>
<namePart type="family">Shao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Donia</namePart>
<namePart type="family">Scott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nuria</namePart>
<namePart type="family">Bel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>As a conversational intelligence task, visual dialog entails answering a series of questions grounded in an image, using the dialog history as context. To generate correct answers, the comprehension of the semantic dependencies among implicit visual and textual contents is critical. Prior works usually ignored the underlying relation and failed to infer it reasonably. In this paper, we propose a Visual-Textual Alignment for Graph Inference (VTAGI) network. Compared with other approaches, it makes up the lack of structural inference in visual dialog. The whole system consists of two modules, Visual and Textual Alignment (VTA) and Visual Graph Attended by Text (VGAT). Specially, the VTA module aims at representing an image with a set of integrated visual regions and corresponding textual concepts, reflecting certain semantics. The VGAT module views the visual features with semantic information as observed nodes and each node learns the relationship with others in visual graph. We also qualitatively and quantitatively evaluate the model on VisDial v1.0 dataset, showing our VTAGI outperforms previous state-of-the-art models.</abstract>
<identifier type="citekey">jiang-etal-2020-visual</identifier>
<identifier type="doi">10.18653/v1/2020.coling-main.170</identifier>
<location>
<url>https://aclanthology.org/2020.coling-main.170</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>1874</start>
<end>1885</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Visual-Textual Alignment for Graph Inference in Visual Dialog
%A Jiang, Tianling
%A Ji, Yi
%A Liu, Chunping
%A Shao, Hailin
%Y Scott, Donia
%Y Bel, Nuria
%Y Zong, Chengqing
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F jiang-etal-2020-visual
%X As a conversational intelligence task, visual dialog entails answering a series of questions grounded in an image, using the dialog history as context. To generate correct answers, the comprehension of the semantic dependencies among implicit visual and textual contents is critical. Prior works usually ignored the underlying relation and failed to infer it reasonably. In this paper, we propose a Visual-Textual Alignment for Graph Inference (VTAGI) network. Compared with other approaches, it makes up the lack of structural inference in visual dialog. The whole system consists of two modules, Visual and Textual Alignment (VTA) and Visual Graph Attended by Text (VGAT). Specially, the VTA module aims at representing an image with a set of integrated visual regions and corresponding textual concepts, reflecting certain semantics. The VGAT module views the visual features with semantic information as observed nodes and each node learns the relationship with others in visual graph. We also qualitatively and quantitatively evaluate the model on VisDial v1.0 dataset, showing our VTAGI outperforms previous state-of-the-art models.
%R 10.18653/v1/2020.coling-main.170
%U https://aclanthology.org/2020.coling-main.170
%U https://doi.org/10.18653/v1/2020.coling-main.170
%P 1874-1885
Markdown (Informal)
[Visual-Textual Alignment for Graph Inference in Visual Dialog](https://aclanthology.org/2020.coling-main.170) (Jiang et al., COLING 2020)
ACL