@inproceedings{liang-etal-2021-graghvqa,
title = "{G}raph{VQA}: Language-Guided Graph Neural Networks for Graph-based Visual Question Answering",
author = "Liang, Weixin and
Jiang, Yanhao and
Liu, Zixuan",
editor = "Zadeh, Amir and
Morency, Louis-Philippe and
Liang, Paul Pu and
Ross, Candace and
Salakhutdinov, Ruslan and
Poria, Soujanya and
Cambria, Erik and
Shi, Kelly",
booktitle = "Proceedings of the Third Workshop on Multimodal Artificial Intelligence",
month = jun,
year = "2021",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.maiworkshop-1.12",
doi = "10.18653/v1/2021.maiworkshop-1.12",
pages = "79--86",
abstract = "Images are more than a collection of objects or attributes {---} they represent a web of relationships among interconnected objects. Scene Graph has emerged as a new modality as a structured graphical representation of images. Scene Graph encodes objects as nodes connected via pairwise relations as edges. To support question answering on scene graphs, we propose GraphVQA, a language-guided graph neural network framework that translates and executes a natural language question as multiple iterations of message passing among graph nodes. We explore the design space of GraphVQA framework, and discuss the trade-off of different design choices. Our experiments on GQA dataset show that GraphVQA outperforms the state-of-the-art accuracy by a large margin (88.43{\%} vs. 94.78{\%}).",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liang-etal-2021-graghvqa">
<titleInfo>
<title>GraphVQA: Language-Guided Graph Neural Networks for Graph-based Visual Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Weixin</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanhao</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zixuan</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Multimodal Artificial Intelligence</title>
</titleInfo>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="family">Zadeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Louis-Philippe</namePart>
<namePart type="family">Morency</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="given">Pu</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Candace</namePart>
<namePart type="family">Ross</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Salakhutdinov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Soujanya</namePart>
<namePart type="family">Poria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erik</namePart>
<namePart type="family">Cambria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kelly</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Images are more than a collection of objects or attributes — they represent a web of relationships among interconnected objects. Scene Graph has emerged as a new modality as a structured graphical representation of images. Scene Graph encodes objects as nodes connected via pairwise relations as edges. To support question answering on scene graphs, we propose GraphVQA, a language-guided graph neural network framework that translates and executes a natural language question as multiple iterations of message passing among graph nodes. We explore the design space of GraphVQA framework, and discuss the trade-off of different design choices. Our experiments on GQA dataset show that GraphVQA outperforms the state-of-the-art accuracy by a large margin (88.43% vs. 94.78%).</abstract>
<identifier type="citekey">liang-etal-2021-graghvqa</identifier>
<identifier type="doi">10.18653/v1/2021.maiworkshop-1.12</identifier>
<location>
<url>https://aclanthology.org/2021.maiworkshop-1.12</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>79</start>
<end>86</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GraphVQA: Language-Guided Graph Neural Networks for Graph-based Visual Question Answering
%A Liang, Weixin
%A Jiang, Yanhao
%A Liu, Zixuan
%Y Zadeh, Amir
%Y Morency, Louis-Philippe
%Y Liang, Paul Pu
%Y Ross, Candace
%Y Salakhutdinov, Ruslan
%Y Poria, Soujanya
%Y Cambria, Erik
%Y Shi, Kelly
%S Proceedings of the Third Workshop on Multimodal Artificial Intelligence
%D 2021
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F liang-etal-2021-graghvqa
%X Images are more than a collection of objects or attributes — they represent a web of relationships among interconnected objects. Scene Graph has emerged as a new modality as a structured graphical representation of images. Scene Graph encodes objects as nodes connected via pairwise relations as edges. To support question answering on scene graphs, we propose GraphVQA, a language-guided graph neural network framework that translates and executes a natural language question as multiple iterations of message passing among graph nodes. We explore the design space of GraphVQA framework, and discuss the trade-off of different design choices. Our experiments on GQA dataset show that GraphVQA outperforms the state-of-the-art accuracy by a large margin (88.43% vs. 94.78%).
%R 10.18653/v1/2021.maiworkshop-1.12
%U https://aclanthology.org/2021.maiworkshop-1.12
%U https://doi.org/10.18653/v1/2021.maiworkshop-1.12
%P 79-86
Markdown (Informal)
[GraphVQA: Language-Guided Graph Neural Networks for Graph-based Visual Question Answering](https://aclanthology.org/2021.maiworkshop-1.12) (Liang et al., maiworkshop 2021)
ACL