@inproceedings{li-etal-2023-multi-modal-debiasing,
title = "A Multi-modal Debiasing Model with Dynamical Constraint for Robust Visual Question Answering",
author = "Li, Yu and
Hu, Bojie and
Zhang, Fengshuo and
Yu, Yahan and
Liu, Jian and
Chen, Yufeng and
Xu, Jinan",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.311/",
doi = "10.18653/v1/2023.findings-acl.311",
pages = "5032--5045",
abstract = "Recent studies have pointed out that many well-developed Visual Question Answering (VQA) systems suffer from bias problem. Despite the remarkable performance gained on In-Distribution (ID) datasets, the VQA model might merely capture the superficial correlation from question to answer rather than showing real reasoning abilities. Therefore, when switching to Out-of-Distribution (OOD) dataset, whose test distribution is unknown or even reversed with the training set, significant drops might be demonstrated. Although efforts have been devoted to easing the negative bias effect brought by language prior and analysing its inherent cause, they are still limited by the following two aspects. First, most current debiasing methods achieve promising OOD generalization ability with a major sacrifice of the ID performance. Second, existing researches are restricted by exploiting comprehensive biases, since weakening the language bias is mainly focused, while only a few works consider vision bias. In this paper, we investigate a straightforward way to mitigate bias problem for VQA task. Specifically, we reduce bias effect by subtracting bias score from standard VQA base score. Based on such a direct strategy, we design two bias learning branches to detect more bias information, which are combined with a dynamical constraint loss to alleviate the problem of over-correction and insufficient debiasing effect. We evaluate our method on the challenging VQA v2.0 and VQA-CP V2,0 datasets and the proposed method achievessignificant improvement."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2023-multi-modal-debiasing">
<titleInfo>
<title>A Multi-modal Debiasing Model with Dynamical Constraint for Robust Visual Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bojie</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fengshuo</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yahan</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jian</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yufeng</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinan</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent studies have pointed out that many well-developed Visual Question Answering (VQA) systems suffer from bias problem. Despite the remarkable performance gained on In-Distribution (ID) datasets, the VQA model might merely capture the superficial correlation from question to answer rather than showing real reasoning abilities. Therefore, when switching to Out-of-Distribution (OOD) dataset, whose test distribution is unknown or even reversed with the training set, significant drops might be demonstrated. Although efforts have been devoted to easing the negative bias effect brought by language prior and analysing its inherent cause, they are still limited by the following two aspects. First, most current debiasing methods achieve promising OOD generalization ability with a major sacrifice of the ID performance. Second, existing researches are restricted by exploiting comprehensive biases, since weakening the language bias is mainly focused, while only a few works consider vision bias. In this paper, we investigate a straightforward way to mitigate bias problem for VQA task. Specifically, we reduce bias effect by subtracting bias score from standard VQA base score. Based on such a direct strategy, we design two bias learning branches to detect more bias information, which are combined with a dynamical constraint loss to alleviate the problem of over-correction and insufficient debiasing effect. We evaluate our method on the challenging VQA v2.0 and VQA-CP V2,0 datasets and the proposed method achievessignificant improvement.</abstract>
<identifier type="citekey">li-etal-2023-multi-modal-debiasing</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.311</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.311/</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>5032</start>
<end>5045</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Multi-modal Debiasing Model with Dynamical Constraint for Robust Visual Question Answering
%A Li, Yu
%A Hu, Bojie
%A Zhang, Fengshuo
%A Yu, Yahan
%A Liu, Jian
%A Chen, Yufeng
%A Xu, Jinan
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F li-etal-2023-multi-modal-debiasing
%X Recent studies have pointed out that many well-developed Visual Question Answering (VQA) systems suffer from bias problem. Despite the remarkable performance gained on In-Distribution (ID) datasets, the VQA model might merely capture the superficial correlation from question to answer rather than showing real reasoning abilities. Therefore, when switching to Out-of-Distribution (OOD) dataset, whose test distribution is unknown or even reversed with the training set, significant drops might be demonstrated. Although efforts have been devoted to easing the negative bias effect brought by language prior and analysing its inherent cause, they are still limited by the following two aspects. First, most current debiasing methods achieve promising OOD generalization ability with a major sacrifice of the ID performance. Second, existing researches are restricted by exploiting comprehensive biases, since weakening the language bias is mainly focused, while only a few works consider vision bias. In this paper, we investigate a straightforward way to mitigate bias problem for VQA task. Specifically, we reduce bias effect by subtracting bias score from standard VQA base score. Based on such a direct strategy, we design two bias learning branches to detect more bias information, which are combined with a dynamical constraint loss to alleviate the problem of over-correction and insufficient debiasing effect. We evaluate our method on the challenging VQA v2.0 and VQA-CP V2,0 datasets and the proposed method achievessignificant improvement.
%R 10.18653/v1/2023.findings-acl.311
%U https://aclanthology.org/2023.findings-acl.311/
%U https://doi.org/10.18653/v1/2023.findings-acl.311
%P 5032-5045
Markdown (Informal)
[A Multi-modal Debiasing Model with Dynamical Constraint for Robust Visual Question Answering](https://aclanthology.org/2023.findings-acl.311/) (Li et al., Findings 2023)
ACL