@inproceedings{urailertprasert-etal-2024-sea,
title = "{SEA}-{VQA}: {S}outheast {A}sian Cultural Context Dataset For Visual Question Answering",
author = "Urailertprasert, Norawit and
Limkonchotiwat, Peerat and
Suwajanakorn, Supasorn and
Nutanong, Sarana",
editor = "Gu, Jing and
Fu, Tsu-Jui (Ray) and
Hudson, Drew and
Celikyilmaz, Asli and
Wang, William",
booktitle = "Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.alvr-1.15/",
doi = "10.18653/v1/2024.alvr-1.15",
pages = "173--185",
abstract = "Visual Question Answering (VQA) is a critical task that requires the simultaneous understanding of visual and textual information. While significant advancements have been made with multilingual datasets, these often lack cultural specificity, especially in the context of Southeast Asia (SEA). In this paper, we introduce SEA-VQA aiming to highlight the challenges and gaps in existing VQA models when confronted with culturally specific content. Our dataset includes images from eight SEA countries, curated from the UNESCO Cultural Heritage collection. Our evaluation, comparing GPT-4 and GEMINI models, demonstrates substantial performance drops on culture-centric questions compared to the A-OKVQA dataset, a commonsense and world-knowledge VQA benchmark comprising approximately 25,000 questions. Our findings underscore the importance of cultural diversity in VQA datasets and reveal substantial gaps in the ability of current VQA models to handle culturally rich contexts. SEA-VQA serves as a crucial benchmark for identifying these gaps and guiding future improvements in VQA systems."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="urailertprasert-etal-2024-sea">
<titleInfo>
<title>SEA-VQA: Southeast Asian Cultural Context Dataset For Visual Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Norawit</namePart>
<namePart type="family">Urailertprasert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peerat</namePart>
<namePart type="family">Limkonchotiwat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Supasorn</namePart>
<namePart type="family">Suwajanakorn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sarana</namePart>
<namePart type="family">Nutanong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Gu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tsu-Jui</namePart>
<namePart type="given">(Ray)</namePart>
<namePart type="family">Fu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Drew</namePart>
<namePart type="family">Hudson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asli</namePart>
<namePart type="family">Celikyilmaz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">William</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Visual Question Answering (VQA) is a critical task that requires the simultaneous understanding of visual and textual information. While significant advancements have been made with multilingual datasets, these often lack cultural specificity, especially in the context of Southeast Asia (SEA). In this paper, we introduce SEA-VQA aiming to highlight the challenges and gaps in existing VQA models when confronted with culturally specific content. Our dataset includes images from eight SEA countries, curated from the UNESCO Cultural Heritage collection. Our evaluation, comparing GPT-4 and GEMINI models, demonstrates substantial performance drops on culture-centric questions compared to the A-OKVQA dataset, a commonsense and world-knowledge VQA benchmark comprising approximately 25,000 questions. Our findings underscore the importance of cultural diversity in VQA datasets and reveal substantial gaps in the ability of current VQA models to handle culturally rich contexts. SEA-VQA serves as a crucial benchmark for identifying these gaps and guiding future improvements in VQA systems.</abstract>
<identifier type="citekey">urailertprasert-etal-2024-sea</identifier>
<identifier type="doi">10.18653/v1/2024.alvr-1.15</identifier>
<location>
<url>https://aclanthology.org/2024.alvr-1.15/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>173</start>
<end>185</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SEA-VQA: Southeast Asian Cultural Context Dataset For Visual Question Answering
%A Urailertprasert, Norawit
%A Limkonchotiwat, Peerat
%A Suwajanakorn, Supasorn
%A Nutanong, Sarana
%Y Gu, Jing
%Y Fu, Tsu-Jui (Ray)
%Y Hudson, Drew
%Y Celikyilmaz, Asli
%Y Wang, William
%S Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F urailertprasert-etal-2024-sea
%X Visual Question Answering (VQA) is a critical task that requires the simultaneous understanding of visual and textual information. While significant advancements have been made with multilingual datasets, these often lack cultural specificity, especially in the context of Southeast Asia (SEA). In this paper, we introduce SEA-VQA aiming to highlight the challenges and gaps in existing VQA models when confronted with culturally specific content. Our dataset includes images from eight SEA countries, curated from the UNESCO Cultural Heritage collection. Our evaluation, comparing GPT-4 and GEMINI models, demonstrates substantial performance drops on culture-centric questions compared to the A-OKVQA dataset, a commonsense and world-knowledge VQA benchmark comprising approximately 25,000 questions. Our findings underscore the importance of cultural diversity in VQA datasets and reveal substantial gaps in the ability of current VQA models to handle culturally rich contexts. SEA-VQA serves as a crucial benchmark for identifying these gaps and guiding future improvements in VQA systems.
%R 10.18653/v1/2024.alvr-1.15
%U https://aclanthology.org/2024.alvr-1.15/
%U https://doi.org/10.18653/v1/2024.alvr-1.15
%P 173-185
Markdown (Informal)
[SEA-VQA: Southeast Asian Cultural Context Dataset For Visual Question Answering](https://aclanthology.org/2024.alvr-1.15/) (Urailertprasert et al., ALVR 2024)
ACL