@inproceedings{diao-etal-2024-learning,
title = "Learning Musical Representations for Music Performance Question Answering",
author = "Diao, Xingjian and
Zhang, Chunhui and
Wu, Tingxuan and
Cheng, Ming and
Ouyang, Zhongyu and
Wu, Weiyi and
Gui, Jiang",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.159/",
doi = "10.18653/v1/2024.findings-emnlp.159",
pages = "2803--2813",
abstract = "Music performances are representative scenarios for audio-visual modeling. Unlike common scenarios with sparse audio, music performances continuously involve dense audio signals throughout. While existing multimodal learning methods on the audio-video QA demonstrate impressive capabilities on general scenarios, they are incapable of dealing with fundamental problems within the music performances: they underexplore the interaction between the multimodal signals in performance, and fail to consider the distinctive characteristics of instruments and music. Therefore, existing methods tend to inaccurately answer questions regarding musical performances. To bridge the above research gaps, first, given the intricate multimodal interconnectivity inherent to music data, our primary backbone is designed to incorporate multimodal interactions within the context of music; second, to enable the model to learn music characteristics, we annotate and release rhythmic and music sources in the current music datasets; third, for time-aware audio-visual modelling, we align the model`s music predictions with the temporal dimension. Our experiments show state-of-the-art effects on the Music AVQA datasets. Our code is available at: https://github.com/xid32/Amuse."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="diao-etal-2024-learning">
<titleInfo>
<title>Learning Musical Representations for Music Performance Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xingjian</namePart>
<namePart type="family">Diao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chunhui</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tingxuan</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ming</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhongyu</namePart>
<namePart type="family">Ouyang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weiyi</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiang</namePart>
<namePart type="family">Gui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Music performances are representative scenarios for audio-visual modeling. Unlike common scenarios with sparse audio, music performances continuously involve dense audio signals throughout. While existing multimodal learning methods on the audio-video QA demonstrate impressive capabilities on general scenarios, they are incapable of dealing with fundamental problems within the music performances: they underexplore the interaction between the multimodal signals in performance, and fail to consider the distinctive characteristics of instruments and music. Therefore, existing methods tend to inaccurately answer questions regarding musical performances. To bridge the above research gaps, first, given the intricate multimodal interconnectivity inherent to music data, our primary backbone is designed to incorporate multimodal interactions within the context of music; second, to enable the model to learn music characteristics, we annotate and release rhythmic and music sources in the current music datasets; third, for time-aware audio-visual modelling, we align the model's music predictions with the temporal dimension. Our experiments show state-of-the-art effects on the Music AVQA datasets. Our code is available at: https://github.com/xid32/Amuse.</abstract>
<identifier type="citekey">diao-etal-2024-learning</identifier>
<identifier type="doi">10.18653/v1/2024.findings-emnlp.159</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.159/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>2803</start>
<end>2813</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Learning Musical Representations for Music Performance Question Answering
%A Diao, Xingjian
%A Zhang, Chunhui
%A Wu, Tingxuan
%A Cheng, Ming
%A Ouyang, Zhongyu
%A Wu, Weiyi
%A Gui, Jiang
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F diao-etal-2024-learning
%X Music performances are representative scenarios for audio-visual modeling. Unlike common scenarios with sparse audio, music performances continuously involve dense audio signals throughout. While existing multimodal learning methods on the audio-video QA demonstrate impressive capabilities on general scenarios, they are incapable of dealing with fundamental problems within the music performances: they underexplore the interaction between the multimodal signals in performance, and fail to consider the distinctive characteristics of instruments and music. Therefore, existing methods tend to inaccurately answer questions regarding musical performances. To bridge the above research gaps, first, given the intricate multimodal interconnectivity inherent to music data, our primary backbone is designed to incorporate multimodal interactions within the context of music; second, to enable the model to learn music characteristics, we annotate and release rhythmic and music sources in the current music datasets; third, for time-aware audio-visual modelling, we align the model's music predictions with the temporal dimension. Our experiments show state-of-the-art effects on the Music AVQA datasets. Our code is available at: https://github.com/xid32/Amuse.
%R 10.18653/v1/2024.findings-emnlp.159
%U https://aclanthology.org/2024.findings-emnlp.159/
%U https://doi.org/10.18653/v1/2024.findings-emnlp.159
%P 2803-2813
Markdown (Informal)
[Learning Musical Representations for Music Performance Question Answering](https://aclanthology.org/2024.findings-emnlp.159/) (Diao et al., Findings 2024)
ACL
- Xingjian Diao, Chunhui Zhang, Tingxuan Wu, Ming Cheng, Zhongyu Ouyang, Weiyi Wu, and Jiang Gui. 2024. Learning Musical Representations for Music Performance Question Answering. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 2803–2813, Miami, Florida, USA. Association for Computational Linguistics.