@inproceedings{lee-etal-2021-qace-asking,
title = "{QACE}: Asking Questions to Evaluate an Image Caption",
author = "Lee, Hwanhee and
Scialom, Thomas and
Yoon, Seunghyun and
Dernoncourt, Franck and
Jung, Kyomin",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2021",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.findings-emnlp.395",
doi = "10.18653/v1/2021.findings-emnlp.395",
pages = "4631--4638",
abstract = "In this paper, we propose QACE, a new metric based on Question Answering for Caption Evaluation: it evaluates image captions using Question Generation (QG) and Question Answering (QA) systems. QACE generates questions about the evaluated caption and checks its content by asking those questions of either the reference caption or the source image. We first develop QACE{\_}Ref, which compares the answers obtained from the evaluated caption to those from its reference, and report results competitive with state-of-the-art metrics. To go further, we propose QACE{\_}Img, which asks the questions directly of the image instead of the reference. A Visual-QA system is necessary for QACE{\_}Img; unfortunately, standard VQA models are framed as classification over only a few thousand categories. Instead, we propose Visual-T5, an abstractive VQA system. The resulting metric, QACE{\_}Img, is multi-modal, reference-less, and explainable. Our experiments show that QACE{\_}Img compares favorably with other reference-less metrics.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lee-etal-2021-qace-asking">
<titleInfo>
<title>QACE: Asking Questions to Evaluate an Image Caption</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hwanhee</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Scialom</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seunghyun</namePart>
<namePart type="family">Yoon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyomin</namePart>
<namePart type="family">Jung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2021</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuanjing</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Specia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Scott</namePart>
<namePart type="given">Wen-tau</namePart>
<namePart type="family">Yih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we propose QACE, a new metric based on Question Answering for Caption Evaluation: it evaluates image captions using Question Generation (QG) and Question Answering (QA) systems. QACE generates questions about the evaluated caption and checks its content by asking those questions of either the reference caption or the source image. We first develop QACE_Ref, which compares the answers obtained from the evaluated caption to those from its reference, and report results competitive with state-of-the-art metrics. To go further, we propose QACE_Img, which asks the questions directly of the image instead of the reference. A Visual-QA system is necessary for QACE_Img; unfortunately, standard VQA models are framed as classification over only a few thousand categories. Instead, we propose Visual-T5, an abstractive VQA system. The resulting metric, QACE_Img, is multi-modal, reference-less, and explainable. Our experiments show that QACE_Img compares favorably with other reference-less metrics.</abstract>
<identifier type="citekey">lee-etal-2021-qace-asking</identifier>
<identifier type="doi">10.18653/v1/2021.findings-emnlp.395</identifier>
<location>
<url>https://aclanthology.org/2021.findings-emnlp.395</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>4631</start>
<end>4638</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T QACE: Asking Questions to Evaluate an Image Caption
%A Lee, Hwanhee
%A Scialom, Thomas
%A Yoon, Seunghyun
%A Dernoncourt, Franck
%A Jung, Kyomin
%Y Moens, Marie-Francine
%Y Huang, Xuanjing
%Y Specia, Lucia
%Y Yih, Scott Wen-tau
%S Findings of the Association for Computational Linguistics: EMNLP 2021
%D 2021
%8 November
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic
%F lee-etal-2021-qace-asking
%X In this paper, we propose QACE, a new metric based on Question Answering for Caption Evaluation: it evaluates image captions using Question Generation (QG) and Question Answering (QA) systems. QACE generates questions about the evaluated caption and checks its content by asking those questions of either the reference caption or the source image. We first develop QACE_Ref, which compares the answers obtained from the evaluated caption to those from its reference, and report results competitive with state-of-the-art metrics. To go further, we propose QACE_Img, which asks the questions directly of the image instead of the reference. A Visual-QA system is necessary for QACE_Img; unfortunately, standard VQA models are framed as classification over only a few thousand categories. Instead, we propose Visual-T5, an abstractive VQA system. The resulting metric, QACE_Img, is multi-modal, reference-less, and explainable. Our experiments show that QACE_Img compares favorably with other reference-less metrics.
%R 10.18653/v1/2021.findings-emnlp.395
%U https://aclanthology.org/2021.findings-emnlp.395
%U https://doi.org/10.18653/v1/2021.findings-emnlp.395
%P 4631-4638
Markdown (Informal)
[QACE: Asking Questions to Evaluate an Image Caption](https://aclanthology.org/2021.findings-emnlp.395) (Lee et al., Findings 2021)
ACL
Hwanhee Lee, Thomas Scialom, Seunghyun Yoon, Franck Dernoncourt, and Kyomin Jung. 2021. QACE: Asking Questions to Evaluate an Image Caption. In Findings of the Association for Computational Linguistics: EMNLP 2021, pages 4631–4638, Punta Cana, Dominican Republic. Association for Computational Linguistics.
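
For readers who want the gist of the method, below is a minimal, self-contained sketch of the QACE_Ref scoring loop described in the abstract. The helpers generate_questions and answer are hypothetical toy stand-ins: real QACE uses trained QG and QA models (and, for QACE_Img, the abstractive Visual-T5 VQA model on the image); nothing here is the authors' released code.

```python
# Minimal sketch of the QACE_Ref scoring loop (not the authors' code).
# generate_questions() and answer() are toy stand-ins for the trained
# QG and QA models used in the paper.

def generate_questions(caption: str) -> list[str]:
    # Stand-in: one template question per longer word; a real system
    # would use a trained question-generation model here.
    words = {w.strip(".,") for w in caption.split() if len(w) > 3}
    return [f"Is '{w}' mentioned?" for w in sorted(words)]

def answer(question: str, context: str) -> str:
    # Stand-in: trivial containment check; a real system would run a
    # QA model on the reference caption (or, for QACE_Img, an
    # abstractive VQA model such as Visual-T5 on the source image).
    word = question.split("'")[1]
    return "yes" if word.lower() in context.lower() else "no"

def qace_ref(candidate: str, reference: str) -> float:
    """Fraction of candidate-derived questions whose answers agree
    when asked of the candidate and of the reference caption."""
    questions = generate_questions(candidate)
    if not questions:
        return 0.0
    agree = sum(answer(q, candidate) == answer(q, reference) for q in questions)
    return agree / len(questions)

print(qace_ref("a dog runs on the beach", "a dog runs near the water"))  # 0.5
```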