@inproceedings{dessi-etal-2022-communication,
title = "Communication breakdown: On the low mutual intelligibility between human and neural captioning",
author = "Dess{\`\i}, Roberto and
Gualdoni, Eleonora and
Franzon, Francesca and
Boleda, Gemma and
Baroni, Marco",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.546",
doi = "10.18653/v1/2022.emnlp-main.546",
pages = "7998--8007",
abstract = "We compare the 0-shot performance of a neural caption-based image retriever when given as input either human-produced captions or captions generated by a neural captioner. We conduct this comparison on the recently introduced ImageCoDe data-set (Krojer et al. 2022), which contains hard distractors nearly identical to the images to be retrieved. We find that the neural retriever has much higher performance when fed neural rather than human captions, despite the fact that the former, unlike the latter, were generated without awareness of the distractors that make the task hard. Even more remarkably, when the same neural captions are given to human subjects, their retrieval performance is almost at chance level. Our results thus add to the growing body of evidence that, even when the {``}language{''} of neural models resembles English, this superficial resemblance might be deeply misleading.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dessi-etal-2022-communication">
<titleInfo>
<title>Communication breakdown: On the low mutual intelligibility between human and neural captioning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Dessì</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eleonora</namePart>
<namePart type="family">Gualdoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francesca</namePart>
<namePart type="family">Franzon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gemma</namePart>
<namePart type="family">Boleda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Baroni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We compare the 0-shot performance of a neural caption-based image retriever when given as input either human-produced captions or captions generated by a neural captioner. We conduct this comparison on the recently introduced ImageCoDe data-set (Krojer et al. 2022), which contains hard distractors nearly identical to the images to be retrieved. We find that the neural retriever has much higher performance when fed neural rather than human captions, despite the fact that the former, unlike the latter, were generated without awareness of the distractors that make the task hard. Even more remarkably, when the same neural captions are given to human subjects, their retrieval performance is almost at chance level. Our results thus add to the growing body of evidence that, even when the “language” of neural models resembles English, this superficial resemblance might be deeply misleading.</abstract>
<identifier type="citekey">dessi-etal-2022-communication</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-main.546</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-main.546</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>7998</start>
<end>8007</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Communication breakdown: On the low mutual intelligibility between human and neural captioning
%A Dessì, Roberto
%A Gualdoni, Eleonora
%A Franzon, Francesca
%A Boleda, Gemma
%A Baroni, Marco
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F dessi-etal-2022-communication
%X We compare the 0-shot performance of a neural caption-based image retriever when given as input either human-produced captions or captions generated by a neural captioner. We conduct this comparison on the recently introduced ImageCoDe data-set (Krojer et al. 2022), which contains hard distractors nearly identical to the images to be retrieved. We find that the neural retriever has much higher performance when fed neural rather than human captions, despite the fact that the former, unlike the latter, were generated without awareness of the distractors that make the task hard. Even more remarkably, when the same neural captions are given to human subjects, their retrieval performance is almost at chance level. Our results thus add to the growing body of evidence that, even when the “language” of neural models resembles English, this superficial resemblance might be deeply misleading.
%R 10.18653/v1/2022.emnlp-main.546
%U https://aclanthology.org/2022.emnlp-main.546
%U https://doi.org/10.18653/v1/2022.emnlp-main.546
%P 7998-8007
Markdown (Informal)
[Communication breakdown: On the low mutual intelligibility between human and neural captioning](https://aclanthology.org/2022.emnlp-main.546) (Dessì et al., EMNLP 2022)
ACL