@inproceedings{ding-etal-2024-learn,
title = "Can We Learn Question, Answer, and Distractors All from an Image? A New Task for Multiple-choice Visual Question Answering",
author = "Ding, Wenjian and
Zhang, Yao and
Wang, Jun and
Jatowt, Adam and
Yang, Zhenglu",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.254/",
pages = "2852--2863",
abstract = "Multiple-choice visual question answering (MC VQA) requires an answer picked from a list of distractors, based on a question and an image. This research has attracted wide interest from the fields of visual question answering, visual question generation, and visual distractor generation. However, these fields still stay in their own territories, and how to jointly generate meaningful questions, correct answers, and challenging distractors remains unexplored. In this paper, we introduce a novel task, Visual Question-Answer-Distractors Generation (VQADG), which can bridge this research gap as well as take as a cornerstone to promote existing VQA models. Specific to the VQADG task, we present a novel framework consisting of a vision-and-language model to encode the given image and generate QADs jointly, and contrastive learning to ensure the consistency of the generated question, answer, and distractors. Empirical evaluations on the benchmark dataset validate the performance of our model in the VQADG task."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ding-etal-2024-learn">
<titleInfo>
<title>Can We Learn Question, Answer, and Distractors All from an Image? A New Task for Multiple-choice Visual Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wenjian</namePart>
<namePart type="family">Ding</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yao</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adam</namePart>
<namePart type="family">Jatowt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhenglu</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Multiple-choice visual question answering (MC VQA) requires an answer picked from a list of distractors, based on a question and an image. This research has attracted wide interest from the fields of visual question answering, visual question generation, and visual distractor generation. However, these fields still stay in their own territories, and how to jointly generate meaningful questions, correct answers, and challenging distractors remains unexplored. In this paper, we introduce a novel task, Visual Question-Answer-Distractors Generation (VQADG), which can bridge this research gap as well as take as a cornerstone to promote existing VQA models. Specific to the VQADG task, we present a novel framework consisting of a vision-and-language model to encode the given image and generate QADs jointly, and contrastive learning to ensure the consistency of the generated question, answer, and distractors. Empirical evaluations on the benchmark dataset validate the performance of our model in the VQADG task.</abstract>
<identifier type="citekey">ding-etal-2024-learn</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.254/</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>2852</start>
<end>2863</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Can We Learn Question, Answer, and Distractors All from an Image? A New Task for Multiple-choice Visual Question Answering
%A Ding, Wenjian
%A Zhang, Yao
%A Wang, Jun
%A Jatowt, Adam
%A Yang, Zhenglu
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F ding-etal-2024-learn
%X Multiple-choice visual question answering (MC VQA) requires an answer picked from a list of distractors, based on a question and an image. This research has attracted wide interest from the fields of visual question answering, visual question generation, and visual distractor generation. However, these fields still stay in their own territories, and how to jointly generate meaningful questions, correct answers, and challenging distractors remains unexplored. In this paper, we introduce a novel task, Visual Question-Answer-Distractors Generation (VQADG), which can bridge this research gap as well as take as a cornerstone to promote existing VQA models. Specific to the VQADG task, we present a novel framework consisting of a vision-and-language model to encode the given image and generate QADs jointly, and contrastive learning to ensure the consistency of the generated question, answer, and distractors. Empirical evaluations on the benchmark dataset validate the performance of our model in the VQADG task.
%U https://aclanthology.org/2024.lrec-main.254/
%P 2852-2863
Markdown (Informal)
[Can We Learn Question, Answer, and Distractors All from an Image? A New Task for Multiple-choice Visual Question Answering](https://aclanthology.org/2024.lrec-main.254/) (Ding et al., LREC-COLING 2024)
ACL
- Wenjian Ding, Yao Zhang, Jun Wang, Adam Jatowt, and Zhenglu Yang. 2024. Can We Learn Question, Answer, and Distractors All from an Image? A New Task for Multiple-choice Visual Question Answering. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 2852–2863, Torino, Italia. ELRA and ICCL.