@article{pezzelle-etal-2021-word,
    title = "Word Representation Learning in Multimodal Pre-Trained Transformers: An Intrinsic Evaluation",
    author = "Pezzelle, Sandro and
      Takmaz, Ece and
      Fern{\'a}ndez, Raquel",
    editor = "Roark, Brian and
      Nenkova, Ani",
    journal = "Transactions of the Association for Computational Linguistics",
    volume = "9",
    year = "2021",
    address = "Cambridge, MA",
    publisher = "MIT Press",
    url = "https://aclanthology.org/2021.tacl-1.93/",
    doi = "10.1162/tacl_a_00443",
    pages = "1563--1579",
    abstract = "This study carries out a systematic intrinsic evaluation of the semantic representations learned by state-of-the-art pre-trained multimodal Transformers. These representations are claimed to be task-agnostic and shown to help on many downstream language-and-vision tasks. However, the extent to which they align with human semantic intuitions remains unclear. We experiment with various models and obtain static word representations from the contextualized ones they learn. We then evaluate them against the semantic judgments provided by human speakers. In line with previous evidence, we observe a generalized advantage of multimodal representations over language-only ones on concrete word pairs, but not on abstract ones. On the one hand, this confirms the effectiveness of these models to align language and vision, which results in better semantic representations for concepts that are grounded in images. On the other hand, models are shown to follow different representation learning patterns, which sheds some light on how and when they perform multimodal integration."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="pezzelle-etal-2021-word">
    <titleInfo>
      <title>Word Representation Learning in Multimodal Pre-Trained Transformers: An Intrinsic Evaluation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Sandro</namePart>
      <namePart type="family">Pezzelle</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ece</namePart>
      <namePart type="family">Takmaz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Raquel</namePart>
      <namePart type="family">Fernández</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <genre authority="bibutilsgt">journal article</genre>
    <relatedItem type="host">
      <titleInfo>
        <title>Transactions of the Association for Computational Linguistics</title>
      </titleInfo>
      <originInfo>
        <issuance>continuing</issuance>
        <publisher>MIT Press</publisher>
        <place>
          <placeTerm type="text">Cambridge, MA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">periodical</genre>
      <genre authority="bibutilsgt">academic journal</genre>
    </relatedItem>
    <abstract>This study carries out a systematic intrinsic evaluation of the semantic representations learned by state-of-the-art pre-trained multimodal Transformers. These representations are claimed to be task-agnostic and shown to help on many downstream language-and-vision tasks. However, the extent to which they align with human semantic intuitions remains unclear. We experiment with various models and obtain static word representations from the contextualized ones they learn. We then evaluate them against the semantic judgments provided by human speakers. In line with previous evidence, we observe a generalized advantage of multimodal representations over language-only ones on concrete word pairs, but not on abstract ones. On the one hand, this confirms the effectiveness of these models to align language and vision, which results in better semantic representations for concepts that are grounded in images. On the other hand, models are shown to follow different representation learning patterns, which sheds some light on how and when they perform multimodal integration.</abstract>
    <identifier type="citekey">pezzelle-etal-2021-word</identifier>
    <identifier type="doi">10.1162/tacl_a_00443</identifier>
    <location>
      <url>https://aclanthology.org/2021.tacl-1.93/</url>
    </location>
    <part>
      <date>2021</date>
      <detail type="volume"><number>9</number></detail>
      <extent unit="page">
        <start>1563</start>
        <end>1579</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Journal Article
%T Word Representation Learning in Multimodal Pre-Trained Transformers: An Intrinsic Evaluation
%A Pezzelle, Sandro
%A Takmaz, Ece
%A Fernández, Raquel
%J Transactions of the Association for Computational Linguistics
%D 2021
%V 9
%I MIT Press
%C Cambridge, MA
%F pezzelle-etal-2021-word
%X This study carries out a systematic intrinsic evaluation of the semantic representations learned by state-of-the-art pre-trained multimodal Transformers. These representations are claimed to be task-agnostic and shown to help on many downstream language-and-vision tasks. However, the extent to which they align with human semantic intuitions remains unclear. We experiment with various models and obtain static word representations from the contextualized ones they learn. We then evaluate them against the semantic judgments provided by human speakers. In line with previous evidence, we observe a generalized advantage of multimodal representations over language-only ones on concrete word pairs, but not on abstract ones. On the one hand, this confirms the effectiveness of these models to align language and vision, which results in better semantic representations for concepts that are grounded in images. On the other hand, models are shown to follow different representation learning patterns, which sheds some light on how and when they perform multimodal integration.
%R 10.1162/tacl_a_00443
%U https://aclanthology.org/2021.tacl-1.93/
%U https://doi.org/10.1162/tacl_a_00443
%P 1563-1579
[Word Representation Learning in Multimodal Pre-Trained Transformers: An Intrinsic Evaluation](https://aclanthology.org/2021.tacl-1.93/) (Pezzelle et al., TACL 2021)