@inproceedings{testoni-etal-2020-alike,
title = "They Are Not All Alike: Answering Different Spatial Questions Requires Different Grounding Strategies",
author = "Testoni, Alberto and
Greco, Claudio and
Bianchi, Tobias and
Mazuecos, Mauricio and
Marcante, Agata and
Benotti, Luciana and
Bernardi, Raffaella",
editor = "Kordjamshidi, Parisa and
Bhatia, Archna and
Alikhani, Malihe and
Baldridge, Jason and
Bansal, Mohit and
Moens, Marie-Francine",
booktitle = "Proceedings of the Third International Workshop on Spatial Language Understanding",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.splu-1.4/",
doi = "10.18653/v1/2020.splu-1.4",
pages = "29--38",
abstract = "In this paper, we study the grounding skills required to answer spatial questions asked by humans while playing the GuessWhat?! game. We propose a classification for spatial questions dividing them into absolute, relational, and group questions. We build a new answerer model based on the LXMERT multimodal transformer and we compare a baseline with and without visual features of the scene. We are interested in studying how the attention mechanisms of LXMERT are used to answer spatial questions since they require putting attention on more than one region simultaneously and spotting the relation holding among them. We show that our proposed model outperforms the baseline by a large extent (9.70{\%} on spatial questions and 6.27{\%} overall). By analyzing LXMERT errors and its attention mechanisms, we find that our classification helps to gain a better understanding of the skills required to answer different spatial questions."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="testoni-etal-2020-alike">
<titleInfo>
<title>They Are Not All Alike: Answering Different Spatial Questions Requires Different Grounding Strategies</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alberto</namePart>
<namePart type="family">Testoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudio</namePart>
<namePart type="family">Greco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tobias</namePart>
<namePart type="family">Bianchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mauricio</namePart>
<namePart type="family">Mazuecos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Agata</namePart>
<namePart type="family">Marcante</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luciana</namePart>
<namePart type="family">Benotti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raffaella</namePart>
<namePart type="family">Bernardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third International Workshop on Spatial Language Understanding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Parisa</namePart>
<namePart type="family">Kordjamshidi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Archna</namePart>
<namePart type="family">Bhatia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Malihe</namePart>
<namePart type="family">Alikhani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jason</namePart>
<namePart type="family">Baldridge</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we study the grounding skills required to answer spatial questions asked by humans while playing the GuessWhat?! game. We propose a classification for spatial questions dividing them into absolute, relational, and group questions. We build a new answerer model based on the LXMERT multimodal transformer and we compare a baseline with and without visual features of the scene. We are interested in studying how the attention mechanisms of LXMERT are used to answer spatial questions since they require putting attention on more than one region simultaneously and spotting the relation holding among them. We show that our proposed model outperforms the baseline by a large extent (9.70% on spatial questions and 6.27% overall). By analyzing LXMERT errors and its attention mechanisms, we find that our classification helps to gain a better understanding of the skills required to answer different spatial questions.</abstract>
<identifier type="citekey">testoni-etal-2020-alike</identifier>
<identifier type="doi">10.18653/v1/2020.splu-1.4</identifier>
<location>
<url>https://aclanthology.org/2020.splu-1.4/</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>29</start>
<end>38</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T They Are Not All Alike: Answering Different Spatial Questions Requires Different Grounding Strategies
%A Testoni, Alberto
%A Greco, Claudio
%A Bianchi, Tobias
%A Mazuecos, Mauricio
%A Marcante, Agata
%A Benotti, Luciana
%A Bernardi, Raffaella
%Y Kordjamshidi, Parisa
%Y Bhatia, Archna
%Y Alikhani, Malihe
%Y Baldridge, Jason
%Y Bansal, Mohit
%Y Moens, Marie-Francine
%S Proceedings of the Third International Workshop on Spatial Language Understanding
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F testoni-etal-2020-alike
%X In this paper, we study the grounding skills required to answer spatial questions asked by humans while playing the GuessWhat?! game. We propose a classification for spatial questions dividing them into absolute, relational, and group questions. We build a new answerer model based on the LXMERT multimodal transformer and we compare a baseline with and without visual features of the scene. We are interested in studying how the attention mechanisms of LXMERT are used to answer spatial questions since they require putting attention on more than one region simultaneously and spotting the relation holding among them. We show that our proposed model outperforms the baseline by a large extent (9.70% on spatial questions and 6.27% overall). By analyzing LXMERT errors and its attention mechanisms, we find that our classification helps to gain a better understanding of the skills required to answer different spatial questions.
%R 10.18653/v1/2020.splu-1.4
%U https://aclanthology.org/2020.splu-1.4/
%U https://doi.org/10.18653/v1/2020.splu-1.4
%P 29-38
Markdown (Informal)
[They Are Not All Alike: Answering Different Spatial Questions Requires Different Grounding Strategies](https://aclanthology.org/2020.splu-1.4/) (Testoni et al., SpLU 2020)
ACL