@inproceedings{shah-etal-2020-reasoning,
    title = "Reasoning Over History: Context Aware Visual Dialog",
    author = "Shah, Muhammad and
      Mehri, Shikib and
      Srinivasan, Tejas",
    editor = "Castellucci, Giuseppe and
      Filice, Simone and
      Poria, Soujanya and
      Cambria, Erik and
      Specia, Lucia",
    booktitle = "Proceedings of the First International Workshop on Natural Language Processing Beyond Text",
    month = nov,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.nlpbt-1.9/",
    doi = "10.18653/v1/2020.nlpbt-1.9",
    pages = "75--83",
    abstract = "While neural models have been shown to exhibit strong performance on single-turn visual question answering (VQA) tasks, extending VQA to a multi-turn, conversational setting remains a challenge. One way to address this challenge is to augment existing strong neural VQA models with the mechanisms that allow them to retain information from previous dialog turns. One strong VQA model is the MAC network, which decomposes a task into a series of attention-based reasoning steps. However, since the MAC network is designed for single-turn question answering, it is not capable of referring to past dialog turns. More specifically, it struggles with tasks that require reasoning over the dialog history, particularly coreference resolution. We extend the MAC network architecture with Context-aware Attention and Memory (CAM), which attends over control states in past dialog turns to determine the necessary reasoning operations for the current question. MAC nets with CAM achieve up to 98.25{\%} accuracy on the CLEVR-Dialog dataset, beating the existing state-of-the-art by 30{\%} (absolute). Our error analysis indicates that with CAM, the model's performance particularly improved on questions that required coreference resolution."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shah-etal-2020-reasoning">
<titleInfo>
<title>Reasoning Over History: Context Aware Visual Dialog</title>
</titleInfo>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="family">Shah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shikib</namePart>
<namePart type="family">Mehri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tejas</namePart>
<namePart type="family">Srinivasan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First International Workshop on Natural Language Processing Beyond Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="family">Castellucci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simone</namePart>
<namePart type="family">Filice</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Soujanya</namePart>
<namePart type="family">Poria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erik</namePart>
<namePart type="family">Cambria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Specia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>While neural models have been shown to exhibit strong performance on single-turn visual question answering (VQA) tasks, extending VQA to a multi-turn, conversational setting remains a challenge. One way to address this challenge is to augment existing strong neural VQA models with the mechanisms that allow them to retain information from previous dialog turns. One strong VQA model is the MAC network, which decomposes a task into a series of attention-based reasoning steps. However, since the MAC network is designed for single-turn question answering, it is not capable of referring to past dialog turns. More specifically, it struggles with tasks that require reasoning over the dialog history, particularly coreference resolution. We extend the MAC network architecture with Context-aware Attention and Memory (CAM), which attends over control states in past dialog turns to determine the necessary reasoning operations for the current question. MAC nets with CAM achieve up to 98.25% accuracy on the CLEVR-Dialog dataset, beating the existing state-of-the-art by 30% (absolute). Our error analysis indicates that with CAM, the model’s performance particularly improved on questions that required coreference resolution.</abstract>
<identifier type="citekey">shah-etal-2020-reasoning</identifier>
<identifier type="doi">10.18653/v1/2020.nlpbt-1.9</identifier>
<location>
<url>https://aclanthology.org/2020.nlpbt-1.9/</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>75</start>
<end>83</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Reasoning Over History: Context Aware Visual Dialog
%A Shah, Muhammad
%A Mehri, Shikib
%A Srinivasan, Tejas
%Y Castellucci, Giuseppe
%Y Filice, Simone
%Y Poria, Soujanya
%Y Cambria, Erik
%Y Specia, Lucia
%S Proceedings of the First International Workshop on Natural Language Processing Beyond Text
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F shah-etal-2020-reasoning
%X While neural models have been shown to exhibit strong performance on single-turn visual question answering (VQA) tasks, extending VQA to a multi-turn, conversational setting remains a challenge. One way to address this challenge is to augment existing strong neural VQA models with the mechanisms that allow them to retain information from previous dialog turns. One strong VQA model is the MAC network, which decomposes a task into a series of attention-based reasoning steps. However, since the MAC network is designed for single-turn question answering, it is not capable of referring to past dialog turns. More specifically, it struggles with tasks that require reasoning over the dialog history, particularly coreference resolution. We extend the MAC network architecture with Context-aware Attention and Memory (CAM), which attends over control states in past dialog turns to determine the necessary reasoning operations for the current question. MAC nets with CAM achieve up to 98.25% accuracy on the CLEVR-Dialog dataset, beating the existing state-of-the-art by 30% (absolute). Our error analysis indicates that with CAM, the model’s performance particularly improved on questions that required coreference resolution.
%R 10.18653/v1/2020.nlpbt-1.9
%U https://aclanthology.org/2020.nlpbt-1.9/
%U https://doi.org/10.18653/v1/2020.nlpbt-1.9
%P 75-83
Markdown (Informal)
[Reasoning Over History: Context Aware Visual Dialog](https://aclanthology.org/2020.nlpbt-1.9/) (Shah et al., nlpbt 2020)
ACL
- Muhammad Shah, Shikib Mehri, and Tejas Srinivasan. 2020. Reasoning Over History: Context Aware Visual Dialog. In Proceedings of the First International Workshop on Natural Language Processing Beyond Text, pages 75–83, Online. Association for Computational Linguistics.