@inproceedings{greco-etal-2022-small,
title = "A Small but Informed and Diverse Model: The Case of the Multimodal {G}uess{W}hat!? Guessing Game",
author = "Greco, Claudio and
Testoni, Alberto and
Bernardi, Raffaella and
Frank, Stella",
editor = "Dobnik, Simon and
Grove, Julian and
Sayeed, Asad",
booktitle = "Proceedings of the 2022 CLASP Conference on (Dis)embodiment",
month = sep,
year = "2022",
address = "Gothenburg, Sweden",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.clasp-1.1/",
pages = "1--10",
abstract = "Pre-trained Vision and Language Transformers achieve high performance on downstream tasks due to their ability to transfer representational knowledge accumulated during pretraining on substantial amounts of data. In this paper, we ask whether it is possible to compete with such models using features based on transferred (pre-trained, frozen) representations combined with a lightweight architecture. We take a multimodal guessing task as our testbed, GuessWhat?!. An ensemble of our lightweight model matches the performance of the finetuned pre-trained transformer (LXMERT). An uncertainty analysis of our ensemble shows that the lightweight transferred representations close the data uncertainty gap with LXMERT, while retaining model diversity leading to ensemble boost. We further demonstrate that LXMERT`s performance gain is due solely to its extra V{\&}L pretraining rather than because of architectural improvements. These results argue for flexible integration of multiple features and lightweight models as a viable alternative to large, cumbersome, pre-trained models."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="greco-etal-2022-small">
<titleInfo>
<title>A Small but Informed and Diverse Model: The Case of the Multimodal GuessWhat!? Guessing Game</title>
</titleInfo>
<name type="personal">
<namePart type="given">Claudio</namePart>
<namePart type="family">Greco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alberto</namePart>
<namePart type="family">Testoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raffaella</namePart>
<namePart type="family">Bernardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stella</namePart>
<namePart type="family">Frank</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 CLASP Conference on (Dis)embodiment</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Dobnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julian</namePart>
<namePart type="family">Grove</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asad</namePart>
<namePart type="family">Sayeed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gothenburg, Sweden</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Pre-trained Vision and Language Transformers achieve high performance on downstream tasks due to their ability to transfer representational knowledge accumulated during pretraining on substantial amounts of data. In this paper, we ask whether it is possible to compete with such models using features based on transferred (pre-trained, frozen) representations combined with a lightweight architecture. We take a multimodal guessing task as our testbed, GuessWhat?!. An ensemble of our lightweight model matches the performance of the finetuned pre-trained transformer (LXMERT). An uncertainty analysis of our ensemble shows that the lightweight transferred representations close the data uncertainty gap with LXMERT, while retaining model diversity leading to ensemble boost. We further demonstrate that LXMERT‘s performance gain is due solely to its extra V&L pretraining rather than because of architectural improvements. These results argue for flexible integration of multiple features and lightweight models as a viable alternative to large, cumbersome, pre-trained models.</abstract>
<identifier type="citekey">greco-etal-2022-small</identifier>
<location>
<url>https://aclanthology.org/2022.clasp-1.1/</url>
</location>
<part>
<date>2022-09</date>
<extent unit="page">
<start>1</start>
<end>10</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Small but Informed and Diverse Model: The Case of the Multimodal GuessWhat!? Guessing Game
%A Greco, Claudio
%A Testoni, Alberto
%A Bernardi, Raffaella
%A Frank, Stella
%Y Dobnik, Simon
%Y Grove, Julian
%Y Sayeed, Asad
%S Proceedings of the 2022 CLASP Conference on (Dis)embodiment
%D 2022
%8 September
%I Association for Computational Linguistics
%C Gothenburg, Sweden
%F greco-etal-2022-small
%X Pre-trained Vision and Language Transformers achieve high performance on downstream tasks due to their ability to transfer representational knowledge accumulated during pretraining on substantial amounts of data. In this paper, we ask whether it is possible to compete with such models using features based on transferred (pre-trained, frozen) representations combined with a lightweight architecture. We take a multimodal guessing task as our testbed, GuessWhat?!. An ensemble of our lightweight model matches the performance of the finetuned pre-trained transformer (LXMERT). An uncertainty analysis of our ensemble shows that the lightweight transferred representations close the data uncertainty gap with LXMERT, while retaining model diversity leading to ensemble boost. We further demonstrate that LXMERT‘s performance gain is due solely to its extra V&L pretraining rather than because of architectural improvements. These results argue for flexible integration of multiple features and lightweight models as a viable alternative to large, cumbersome, pre-trained models.
%U https://aclanthology.org/2022.clasp-1.1/
%P 1-10
Markdown (Informal)
[A Small but Informed and Diverse Model: The Case of the Multimodal GuessWhat!? Guessing Game](https://aclanthology.org/2022.clasp-1.1/) (Greco et al., CLASP 2022)
ACL