@inproceedings{srinivasan-etal-2020-fine,
title = "Fine-Grained Grounding for Multimodal Speech Recognition",
author = "Srinivasan, Tejas and
Sanabria, Ramon and
Metze, Florian and
Elliott, Desmond",
editor = "Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.findings-emnlp.242/",
doi = "10.18653/v1/2020.findings-emnlp.242",
pages = "2667--2677",
abstract = "Multimodal automatic speech recognition systems integrate information from images to improve speech recognition quality, by grounding the speech in the visual context. While visual signals have been shown to be useful for recovering entities that have been masked in the audio, these models should be capable of recovering a broader range of word types. Existing systems rely on global visual features that represent the entire image, but localizing the relevant regions of the image will make it possible to recover a larger set of words, such as adjectives and verbs. In this paper, we propose a model that uses finer-grained visual information from different parts of the image, using automatic object proposals. In experiments on the Flickr8K Audio Captions Corpus, we find that our model improves over approaches that use global visual features, that the proposals enable the model to recover entities and other related words, such as adjectives, and that improvements are due to the model`s ability to localize the correct proposals."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="srinivasan-etal-2020-fine">
<titleInfo>
<title>Fine-Grained Grounding for Multimodal Speech Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tejas</namePart>
<namePart type="family">Srinivasan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ramon</namePart>
<namePart type="family">Sanabria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Florian</namePart>
<namePart type="family">Metze</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Desmond</namePart>
<namePart type="family">Elliott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2020</title>
</titleInfo>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Multimodal automatic speech recognition systems integrate information from images to improve speech recognition quality, by grounding the speech in the visual context. While visual signals have been shown to be useful for recovering entities that have been masked in the audio, these models should be capable of recovering a broader range of word types. Existing systems rely on global visual features that represent the entire image, but localizing the relevant regions of the image will make it possible to recover a larger set of words, such as adjectives and verbs. In this paper, we propose a model that uses finer-grained visual information from different parts of the image, using automatic object proposals. In experiments on the Flickr8K Audio Captions Corpus, we find that our model improves over approaches that use global visual features, that the proposals enable the model to recover entities and other related words, such as adjectives, and that improvements are due to the model's ability to localize the correct proposals.</abstract>
<identifier type="citekey">srinivasan-etal-2020-fine</identifier>
<identifier type="doi">10.18653/v1/2020.findings-emnlp.242</identifier>
<location>
<url>https://aclanthology.org/2020.findings-emnlp.242/</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>2667</start>
<end>2677</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Fine-Grained Grounding for Multimodal Speech Recognition
%A Srinivasan, Tejas
%A Sanabria, Ramon
%A Metze, Florian
%A Elliott, Desmond
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Findings of the Association for Computational Linguistics: EMNLP 2020
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F srinivasan-etal-2020-fine
%X Multimodal automatic speech recognition systems integrate information from images to improve speech recognition quality, by grounding the speech in the visual context. While visual signals have been shown to be useful for recovering entities that have been masked in the audio, these models should be capable of recovering a broader range of word types. Existing systems rely on global visual features that represent the entire image, but localizing the relevant regions of the image will make it possible to recover a larger set of words, such as adjectives and verbs. In this paper, we propose a model that uses finer-grained visual information from different parts of the image, using automatic object proposals. In experiments on the Flickr8K Audio Captions Corpus, we find that our model improves over approaches that use global visual features, that the proposals enable the model to recover entities and other related words, such as adjectives, and that improvements are due to the model's ability to localize the correct proposals.
%R 10.18653/v1/2020.findings-emnlp.242
%U https://aclanthology.org/2020.findings-emnlp.242/
%U https://doi.org/10.18653/v1/2020.findings-emnlp.242
%P 2667-2677
Markdown (Informal)
[Fine-Grained Grounding for Multimodal Speech Recognition](https://aclanthology.org/2020.findings-emnlp.242/) (Srinivasan et al., Findings 2020)
ACL
Tejas Srinivasan, Ramon Sanabria, Florian Metze, and Desmond Elliott. 2020. Fine-Grained Grounding for Multimodal Speech Recognition. In Findings of the Association for Computational Linguistics: EMNLP 2020, pages 2667–2677, Online. Association for Computational Linguistics.