@inproceedings{ji-etal-2024-towards,
title = "Towards One-to-Many Visual Question Answering",
author = "Ji, Huishan and
Si, Qingyi and
Lin, Zheng and
Cao, Yanan and
Wang, Weiping",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.985/",
doi = "10.18653/v1/2024.findings-emnlp.985",
pages = "16931--16943",
abstract = "Most existing Visual Question Answering (VQA) systems are constrained to support domain-specific questions, i.e., to train different models separately for different VQA tasks, thus generalizing poorly to others. For example, models trained on the reasoning-focused dataset GQA struggle to effectively handle samples from the knowledge-emphasizing dataset OKVQA. Meanwhile, in real-world scenarios, it is user-unfriendly to restrict the domain of questions. Therefore, this paper proposes a necessary task: One-to-Many Visual Question Answering, of which the ultimate goal is to enable a single model to answer as many different domains of questions as possible by the effective integration of available VQA resources. To this end, we first investigate into ten common VQA datasets, and break the task of VQA down into the integration of three key abilities.Then, considering assorted questions rely on different VQA abilities, this paper proposes a novel dynamic Mixture of LoRAs (MoL) strategy. MoL mixes three individually trained LoRA adapters (corresponding to each VQA ability) dynamically for different samples demanding various VQA abilities. The proposed MoL strategy is verified to be highly effective by experiments, establishing SOTAs on four datasets. In addition, MoL generalizes well to three extra zero-shot datasets.Data and codes will be released."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ji-etal-2024-towards">
<titleInfo>
<title>Towards One-to-Many Visual Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Huishan</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qingyi</namePart>
<namePart type="family">Si</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zheng</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanan</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weiping</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Most existing Visual Question Answering (VQA) systems are constrained to support domain-specific questions, i.e., to train different models separately for different VQA tasks, thus generalizing poorly to others. For example, models trained on the reasoning-focused dataset GQA struggle to effectively handle samples from the knowledge-emphasizing dataset OKVQA. Meanwhile, in real-world scenarios, it is user-unfriendly to restrict the domain of questions. Therefore, this paper proposes a necessary task: One-to-Many Visual Question Answering, of which the ultimate goal is to enable a single model to answer as many different domains of questions as possible by the effective integration of available VQA resources. To this end, we first investigate into ten common VQA datasets, and break the task of VQA down into the integration of three key abilities.Then, considering assorted questions rely on different VQA abilities, this paper proposes a novel dynamic Mixture of LoRAs (MoL) strategy. MoL mixes three individually trained LoRA adapters (corresponding to each VQA ability) dynamically for different samples demanding various VQA abilities. The proposed MoL strategy is verified to be highly effective by experiments, establishing SOTAs on four datasets. In addition, MoL generalizes well to three extra zero-shot datasets.Data and codes will be released.</abstract>
<identifier type="citekey">ji-etal-2024-towards</identifier>
<identifier type="doi">10.18653/v1/2024.findings-emnlp.985</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.985/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>16931</start>
<end>16943</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards One-to-Many Visual Question Answering
%A Ji, Huishan
%A Si, Qingyi
%A Lin, Zheng
%A Cao, Yanan
%A Wang, Weiping
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F ji-etal-2024-towards
%X Most existing Visual Question Answering (VQA) systems are constrained to support domain-specific questions, i.e., to train different models separately for different VQA tasks, thus generalizing poorly to others. For example, models trained on the reasoning-focused dataset GQA struggle to effectively handle samples from the knowledge-emphasizing dataset OKVQA. Meanwhile, in real-world scenarios, it is user-unfriendly to restrict the domain of questions. Therefore, this paper proposes a necessary task: One-to-Many Visual Question Answering, of which the ultimate goal is to enable a single model to answer as many different domains of questions as possible by the effective integration of available VQA resources. To this end, we first investigate into ten common VQA datasets, and break the task of VQA down into the integration of three key abilities.Then, considering assorted questions rely on different VQA abilities, this paper proposes a novel dynamic Mixture of LoRAs (MoL) strategy. MoL mixes three individually trained LoRA adapters (corresponding to each VQA ability) dynamically for different samples demanding various VQA abilities. The proposed MoL strategy is verified to be highly effective by experiments, establishing SOTAs on four datasets. In addition, MoL generalizes well to three extra zero-shot datasets.Data and codes will be released.
%R 10.18653/v1/2024.findings-emnlp.985
%U https://aclanthology.org/2024.findings-emnlp.985/
%U https://doi.org/10.18653/v1/2024.findings-emnlp.985
%P 16931-16943
Markdown (Informal)
[Towards One-to-Many Visual Question Answering](https://aclanthology.org/2024.findings-emnlp.985/) (Ji et al., Findings 2024)
ACL
- Huishan Ji, Qingyi Si, Zheng Lin, Yanan Cao, and Weiping Wang. 2024. Towards One-to-Many Visual Question Answering. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 16931–16943, Miami, Florida, USA. Association for Computational Linguistics.