BibTeX
@inproceedings{garderes-etal-2020-conceptbert,
title = "{C}oncept{B}ert: Concept-Aware Representation for Visual Question Answering",
author = "Gard{\`e}res, Fran{\c{c}}ois and
Ziaeefard, Maryam and
Abeloos, Baptiste and
Lecue, Freddy",
editor = "Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.findings-emnlp.44/",
doi = "10.18653/v1/2020.findings-emnlp.44",
pages = "489--498",
abstract = "Visual Question Answering (VQA) is a challenging task that has received increasing attention from both the computer vision and the natural language processing communities. A VQA model combines visual and textual features in order to answer questions grounded in an image. Current works in VQA focus on questions which are answerable by direct analysis of the question and image alone. We present a concept-aware algorithm, ConceptBert, for questions which require common sense, or basic factual knowledge from external structured content. Given an image and a question in natural language, ConceptBert requires visual elements of the image and a Knowledge Graph (KG) to infer the correct answer. We introduce a multi-modal representation which learns a joint Concept-Vision-Language embedding inspired by the popular BERT architecture. We exploit ConceptNet KG for encoding the common sense knowledge and evaluate our methodology on the Outside Knowledge-VQA (OK-VQA) and VQA datasets."
}
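A minimal LaTeX usage sketch for the BibTeX record above: it assumes the entry is saved in a file named anthology.bib and compiled with the plain bibliography style (the filename and style are assumptions, not part of the record); the entry is cited by its citekey.

\documentclass{article}
\begin{document}
% Cite the entry by the citekey from the BibTeX record above.
ConceptBert \cite{garderes-etal-2020-conceptbert} augments VQA with
commonsense knowledge drawn from the ConceptNet knowledge graph.

% anthology.bib is an assumed filename holding the entry above;
% "plain" is an assumed bibliography style.
\bibliographystyle{plain}
\bibliography{anthology}
\end{document}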
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="garderes-etal-2020-conceptbert">
<titleInfo>
<title>ConceptBert: Concept-Aware Representation for Visual Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">François</namePart>
<namePart type="family">Gardères</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maryam</namePart>
<namePart type="family">Ziaeefard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Baptiste</namePart>
<namePart type="family">Abeloos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Freddy</namePart>
<namePart type="family">Lecue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2020</title>
</titleInfo>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Visual Question Answering (VQA) is a challenging task that has received increasing attention from both the computer vision and the natural language processing communities. A VQA model combines visual and textual features in order to answer questions grounded in an image. Current works in VQA focus on questions which are answerable by direct analysis of the question and image alone. We present a concept-aware algorithm, ConceptBert, for questions which require common sense, or basic factual knowledge from external structured content. Given an image and a question in natural language, ConceptBert requires visual elements of the image and a Knowledge Graph (KG) to infer the correct answer. We introduce a multi-modal representation which learns a joint Concept-Vision-Language embedding inspired by the popular BERT architecture. We exploit ConceptNet KG for encoding the common sense knowledge and evaluate our methodology on the Outside Knowledge-VQA (OK-VQA) and VQA datasets.</abstract>
<identifier type="citekey">garderes-etal-2020-conceptbert</identifier>
<identifier type="doi">10.18653/v1/2020.findings-emnlp.44</identifier>
<location>
<url>https://aclanthology.org/2020.findings-emnlp.44/</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>489</start>
<end>498</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T ConceptBert: Concept-Aware Representation for Visual Question Answering
%A Gardères, François
%A Ziaeefard, Maryam
%A Abeloos, Baptiste
%A Lecue, Freddy
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Findings of the Association for Computational Linguistics: EMNLP 2020
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F garderes-etal-2020-conceptbert
%X Visual Question Answering (VQA) is a challenging task that has received increasing attention from both the computer vision and the natural language processing communities. A VQA model combines visual and textual features in order to answer questions grounded in an image. Current works in VQA focus on questions which are answerable by direct analysis of the question and image alone. We present a concept-aware algorithm, ConceptBert, for questions which require common sense, or basic factual knowledge from external structured content. Given an image and a question in natural language, ConceptBert requires visual elements of the image and a Knowledge Graph (KG) to infer the correct answer. We introduce a multi-modal representation which learns a joint Concept-Vision-Language embedding inspired by the popular BERT architecture. We exploit ConceptNet KG for encoding the common sense knowledge and evaluate our methodology on the Outside Knowledge-VQA (OK-VQA) and VQA datasets.
%R 10.18653/v1/2020.findings-emnlp.44
%U https://aclanthology.org/2020.findings-emnlp.44/
%U https://doi.org/10.18653/v1/2020.findings-emnlp.44
%P 489-498
Markdown (Informal)
[ConceptBert: Concept-Aware Representation for Visual Question Answering](https://aclanthology.org/2020.findings-emnlp.44/) (Gardères et al., Findings 2020)
ACL
François Gardères, Maryam Ziaeefard, Baptiste Abeloos, and Freddy Lecue. 2020. ConceptBert: Concept-Aware Representation for Visual Question Answering. In Findings of the Association for Computational Linguistics: EMNLP 2020, pages 489–498, Online. Association for Computational Linguistics.