BibTeX
@inproceedings{fei-etal-2024-multimodal,
    title = "From Multimodal {LLM} to Human-level {AI}: Modality, Instruction, Reasoning, Efficiency and beyond",
    author = "Fei, Hao  and
      Yao, Yuan  and
      Zhang, Zhuosheng  and
      Liu, Fuxiao  and
      Zhang, Ao  and
      Chua, Tat-Seng",
    editor = "Klinger, Roman  and
      Okazaki, Naoaki  and
      Calzolari, Nicoletta  and
      Kan, Min-Yen",
    booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024): Tutorial Summaries",
    month = may,
    year = "2024",
    address = "Torino, Italia",
    publisher = "ELRA and ICCL",
    url = "https://aclanthology.org/2024.lrec-tutorials.1",
    pages = "1--8",
    abstract = "Artificial intelligence (AI) encompasses knowledge acquisition and real-world grounding across various modalities. As a multidisciplinary research field, multimodal large language models (MLLMs) have recently garnered growing interest in both academia and industry, showing an unprecedented trend to achieve human-level AI via MLLMs. These large models offer an effective vehicle for understanding, reasoning, and planning by integrating and modeling diverse information modalities, including language, visual, auditory, and sensory data. This tutorial aims to deliver a comprehensive review of cutting-edge research in MLLMs, focusing on four key areas: MLLM architecture design, instructional learning, multimodal reasoning, and the efficiency of MLLMs. We will explore technical advancements, synthesize key challenges, and discuss potential avenues for future research.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fei-etal-2024-multimodal">
<titleInfo>
<title>From Multimodal LLM to Human-level AI: Modality, Instruction, Reasoning, Efficiency and beyond</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Fei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuan</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhuosheng</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fuxiao</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ao</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tat-Seng</namePart>
<namePart type="family">Chua</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024): Tutorial Summaries</title>
</titleInfo>
<name type="personal">
<namePart type="given">Roman</namePart>
<namePart type="family">Klinger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naozaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Artificial intelligence (AI) encompasses knowledge acquisition and real-world grounding across various modalities. As a multidisciplinary research field, multimodal large language models (MLLMs) have recently garnered growing interest in both academia and industry, showing an unprecedented trend to achieve human-level AI via MLLMs. These large models offer an effective vehicle for understanding, reasoning, and planning by integrating and modeling diverse information modalities, including language, visual, auditory, and sensory data. This tutorial aims to deliver a comprehensive review of cutting-edge research in MLLMs, focusing on four key areas: MLLM architecture design, instructional learning, multimodal reasoning, and the efficiency of MLLMs. We will explore technical advancements, synthesize key challenges, and discuss potential avenues for future research.</abstract>
<identifier type="citekey">fei-etal-2024-multimodal</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-tutorials.1</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>1</start>
<end>8</end>
</extent>
</part>
</mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T From Multimodal LLM to Human-level AI: Modality, Instruction, Reasoning, Efficiency and beyond
%A Fei, Hao
%A Yao, Yuan
%A Zhang, Zhuosheng
%A Liu, Fuxiao
%A Zhang, Ao
%A Chua, Tat-Seng
%Y Klinger, Roman
%Y Okazaki, Naoaki
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024): Tutorial Summaries
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F fei-etal-2024-multimodal
%X Artificial intelligence (AI) encompasses knowledge acquisition and real-world grounding across various modalities. As a multidisciplinary research field, multimodal large language models (MLLMs) have recently garnered growing interest in both academia and industry, showing an unprecedented trend to achieve human-level AI via MLLMs. These large models offer an effective vehicle for understanding, reasoning, and planning by integrating and modeling diverse information modalities, including language, visual, auditory, and sensory data. This tutorial aims to deliver a comprehensive review of cutting-edge research in MLLMs, focusing on four key areas: MLLM architecture design, instructional learning, multimodal reasoning, and the efficiency of MLLMs. We will explore technical advancements, synthesize key challenges, and discuss potential avenues for future research.
%U https://aclanthology.org/2024.lrec-tutorials.1
%P 1-8

Markdown (Informal)
[From Multimodal LLM to Human-level AI: Modality, Instruction, Reasoning, Efficiency and beyond](https://aclanthology.org/2024.lrec-tutorials.1) (Fei et al., LREC-COLING 2024)
ACL
Hao Fei, Yuan Yao, Zhuosheng Zhang, Fuxiao Liu, Ao Zhang, and Tat-Seng Chua. 2024. From Multimodal LLM to Human-level AI: Modality, Instruction, Reasoning, Efficiency and beyond. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024): Tutorial Summaries, pages 1–8, Torino, Italia. ELRA and ICCL.