@inproceedings{mitra-etal-2024-one,
title = "Which One? Leveraging Context Between Objects and Multiple Views for Language Grounding",
author = "Mitra, Chancharik and
Anwar, Abrar and
Corona, Rodolfo and
Klein, Dan and
Darrell, Trevor and
Thomason, Jesse",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-long.175/",
doi = "10.18653/v1/2024.naacl-long.175",
pages = "3177--3189",
abstract = "When connecting objects and their language referents in an embodied 3D environment, it is important to note that: (1) an object can be better characterized by leveraging comparative information between itself and other objects, and (2) an object`s appearance can vary with camera position. As such, we present the Multi-view Approach to Grounding in Context (MAGiC) model, which selects an object referent based on language that distinguishes between two similar objects. By pragmatically reasoning over both objects and across multiple views of those objects, MAGiC improves over the state-of-the-art model on the SNARE object reference task with a relative error reduction of 12.9{\%} (representing an absolute improvement of 2.7{\%}). Ablation studies show that reasoning jointly over object referent candidates and multiple views of each object both contribute to improved accuracy. Code: https://github.com/rcorona/magic{\_}snare/"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mitra-etal-2024-one">
<titleInfo>
<title>Which One? Leveraging Context Between Objects and Multiple Views for Language Grounding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chancharik</namePart>
<namePart type="family">Mitra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abrar</namePart>
<namePart type="family">Anwar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rodolfo</namePart>
<namePart type="family">Corona</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dan</namePart>
<namePart type="family">Klein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Darrell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jesse</namePart>
<namePart type="family">Thomason</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>When connecting objects and their language referents in an embodied 3D environment, it is important to note that: (1) an object can be better characterized by leveraging comparative information between itself and other objects, and (2) an object‘s appearance can vary with camera position. As such, we present the Multi-view Approach to Grounding in Context (MAGiC) model, which selects an object referent based on language that distinguishes between two similar objects. By pragmatically reasoning over both objects and across multiple views of those objects, MAGiC improves over the state-of-the-art model on the SNARE object reference task with a relative error reduction of 12.9% (representing an absolute improvement of 2.7%). Ablation studies show that reasoning jointly over object referent candidates and multiple views of each object both contribute to improved accuracy. Code: https://github.com/rcorona/magic_snare/</abstract>
<identifier type="citekey">mitra-etal-2024-one</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-long.175</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-long.175/</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>3177</start>
<end>3189</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Which One? Leveraging Context Between Objects and Multiple Views for Language Grounding
%A Mitra, Chancharik
%A Anwar, Abrar
%A Corona, Rodolfo
%A Klein, Dan
%A Darrell, Trevor
%A Thomason, Jesse
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F mitra-etal-2024-one
%X When connecting objects and their language referents in an embodied 3D environment, it is important to note that: (1) an object can be better characterized by leveraging comparative information between itself and other objects, and (2) an object‘s appearance can vary with camera position. As such, we present the Multi-view Approach to Grounding in Context (MAGiC) model, which selects an object referent based on language that distinguishes between two similar objects. By pragmatically reasoning over both objects and across multiple views of those objects, MAGiC improves over the state-of-the-art model on the SNARE object reference task with a relative error reduction of 12.9% (representing an absolute improvement of 2.7%). Ablation studies show that reasoning jointly over object referent candidates and multiple views of each object both contribute to improved accuracy. Code: https://github.com/rcorona/magic_snare/
%R 10.18653/v1/2024.naacl-long.175
%U https://aclanthology.org/2024.naacl-long.175/
%U https://doi.org/10.18653/v1/2024.naacl-long.175
%P 3177-3189
Markdown (Informal)
[Which One? Leveraging Context Between Objects and Multiple Views for Language Grounding](https://aclanthology.org/2024.naacl-long.175/) (Mitra et al., NAACL 2024)
ACL
- Chancharik Mitra, Abrar Anwar, Rodolfo Corona, Dan Klein, Trevor Darrell, and Jesse Thomason. 2024. Which One? Leveraging Context Between Objects and Multiple Views for Language Grounding. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 3177–3189, Mexico City, Mexico. Association for Computational Linguistics.