@inproceedings{hromei-etal-2024-mm,
title = "{MM}-{IGLU}: Multi-Modal Interactive Grounded Language Understanding",
author = "Hromei, Claudiu Daniel and
Margiotta, Daniele and
Croce, Danilo and
Basili, Roberto",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1000/",
pages = "11440--11451",
abstract = "This paper explores Interactive Grounded Language Understanding (IGLU) challenges within Human-Robot Interaction (HRI). In this setting, a robot interprets user commands related to its environment, aiming to discern whether a specific command can be executed. If faced with ambiguities or incomplete data, the robot poses relevant clarification questions. Drawing from the NeurIPS 2022 IGLU competition, we enrich the dataset by introducing our multi-modal data and natural language descriptions in \textit{MM-IGLU: Multi-Modal Interactive Grounded Language Understanding}. Utilizing a BART-based model that integrates the user`s statement with the environment`s description, and a cutting-edge Multi-Modal Large Language Model that merges both visual and textual data, we offer a valuable resource for ongoing research in the domain. Additionally, we discuss the evaluation methods for such tasks, highlighting potential limitations imposed by traditional string-match-based evaluations on this intricate multi-modal challenge. Moreover, we provide an evaluation benchmark based on human judgment to address the limits and capabilities of such baseline models. This resource is released on a dedicated GitHub repository at https://github.com/crux82/MM-IGLU."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hromei-etal-2024-mm">
<titleInfo>
<title>MM-IGLU: Multi-Modal Interactive Grounded Language Understanding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Claudiu</namePart>
<namePart type="given">Daniel</namePart>
<namePart type="family">Hromei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniele</namePart>
<namePart type="family">Margiotta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Danilo</namePart>
<namePart type="family">Croce</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Basili</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper explores Interactive Grounded Language Understanding (IGLU) challenges within Human-Robot Interaction (HRI). In this setting, a robot interprets user commands related to its environment, aiming to discern whether a specific command can be executed. If faced with ambiguities or incomplete data, the robot poses relevant clarification questions. Drawing from the NeurIPS 2022 IGLU competition, we enrich the dataset by introducing our multi-modal data and natural language descriptions in MM-IGLU: Multi-Modal Interactive Grounded Language Understanding. Utilizing a BART-based model that integrates the user‘s statement with the environment‘s description, and a cutting-edge Multi-Modal Large Language Model that merges both visual and textual data, we offer a valuable resource for ongoing research in the domain. Additionally, we discuss the evaluation methods for such tasks, highlighting potential limitations imposed by traditional string-match-based evaluations on this intricate multi-modal challenge. Moreover, we provide an evaluation benchmark based on human judgment to address the limits and capabilities of such baseline models. This resource is released on a dedicated GitHub repository at https://github.com/crux82/MM-IGLU.</abstract>
<identifier type="citekey">hromei-etal-2024-mm</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1000/</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>11440</start>
<end>11451</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MM-IGLU: Multi-Modal Interactive Grounded Language Understanding
%A Hromei, Claudiu Daniel
%A Margiotta, Daniele
%A Croce, Danilo
%A Basili, Roberto
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F hromei-etal-2024-mm
%X This paper explores Interactive Grounded Language Understanding (IGLU) challenges within Human-Robot Interaction (HRI). In this setting, a robot interprets user commands related to its environment, aiming to discern whether a specific command can be executed. If faced with ambiguities or incomplete data, the robot poses relevant clarification questions. Drawing from the NeurIPS 2022 IGLU competition, we enrich the dataset by introducing our multi-modal data and natural language descriptions in MM-IGLU: Multi-Modal Interactive Grounded Language Understanding. Utilizing a BART-based model that integrates the user‘s statement with the environment‘s description, and a cutting-edge Multi-Modal Large Language Model that merges both visual and textual data, we offer a valuable resource for ongoing research in the domain. Additionally, we discuss the evaluation methods for such tasks, highlighting potential limitations imposed by traditional string-match-based evaluations on this intricate multi-modal challenge. Moreover, we provide an evaluation benchmark based on human judgment to address the limits and capabilities of such baseline models. This resource is released on a dedicated GitHub repository at https://github.com/crux82/MM-IGLU.
%U https://aclanthology.org/2024.lrec-main.1000/
%P 11440-11451
Markdown (Informal)
[MM-IGLU: Multi-Modal Interactive Grounded Language Understanding](https://aclanthology.org/2024.lrec-main.1000/) (Hromei et al., LREC-COLING 2024)
ACL
- Claudiu Daniel Hromei, Daniele Margiotta, Danilo Croce, and Roberto Basili. 2024. MM-IGLU: Multi-Modal Interactive Grounded Language Understanding. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 11440–11451, Torino, Italia. ELRA and ICCL.