@inproceedings{si-etal-2022-examining,
    title = "Re-Examining Calibration: The Case of Question Answering",
    author = "Si, Chenglei and
      Zhao, Chen and
      Min, Sewon and
      Boyd-Graber, Jordan",
    editor = "Goldberg, Yoav and
      Kozareva, Zornitsa and
      Zhang, Yue",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
    month = dec,
    year = "2022",
    address = "Abu Dhabi, United Arab Emirates",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.findings-emnlp.204/",
    doi = "10.18653/v1/2022.findings-emnlp.204",
    pages = "2814--2829",
    abstract = "For users to trust model predictions, they need to understand model outputs, particularly their confidence {---} calibration aims to adjust (calibrate) models' confidence to match expected accuracy. We argue that the traditional calibration evaluation does not promote effective calibrations: for example, it can encourage always assigning a mediocre confidence score to all predictions, which does not help users distinguish correct predictions from wrong ones. Building on those observations, we propose a new calibration metric, MacroCE, that better captures whether the model assigns low confidence to wrong predictions and high confidence to correct predictions. Focusing on the practical application of open-domain question answering, we examine conventional calibration methods applied on the widely-used retriever-reader pipeline, all of which do not bring significant gains under our new MacroCE metric. Toward better calibration, we propose a new calibration method (ConsCal) that uses not just final model predictions but whether multiple model checkpoints make consistent predictions. Altogether, we provide an alternative view of calibration along with a new metric, re-evaluation of existing calibration methods on our metric, and proposal of a more effective calibration method."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="si-etal-2022-examining">
    <titleInfo>
      <title>Re-Examining Calibration: The Case of Question Answering</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Chenglei</namePart>
      <namePart type="family">Si</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chen</namePart>
      <namePart type="family">Zhao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sewon</namePart>
      <namePart type="family">Min</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jordan</namePart>
      <namePart type="family">Boyd-Graber</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yoav</namePart>
        <namePart type="family">Goldberg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Zornitsa</namePart>
        <namePart type="family">Kozareva</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yue</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>For users to trust model predictions, they need to understand model outputs, particularly their confidence — calibration aims to adjust (calibrate) models’ confidence to match expected accuracy. We argue that the traditional calibration evaluation does not promote effective calibrations: for example, it can encourage always assigning a mediocre confidence score to all predictions, which does not help users distinguish correct predictions from wrong ones. Building on those observations, we propose a new calibration metric, MacroCE, that better captures whether the model assigns low confidence to wrong predictions and high confidence to correct predictions. Focusing on the practical application of open-domain question answering, we examine conventional calibration methods applied on the widely-used retriever-reader pipeline, all of which do not bring significant gains under our new MacroCE metric. Toward better calibration, we propose a new calibration method (ConsCal) that uses not just final model predictions but whether multiple model checkpoints make consistent predictions. Altogether, we provide an alternative view of calibration along with a new metric, re-evaluation of existing calibration methods on our metric, and proposal of a more effective calibration method.</abstract>
    <identifier type="citekey">si-etal-2022-examining</identifier>
    <identifier type="doi">10.18653/v1/2022.findings-emnlp.204</identifier>
    <location>
      <url>https://aclanthology.org/2022.findings-emnlp.204/</url>
    </location>
    <part>
      <date>2022-12</date>
      <extent unit="page">
        <start>2814</start>
        <end>2829</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Re-Examining Calibration: The Case of Question Answering
%A Si, Chenglei
%A Zhao, Chen
%A Min, Sewon
%A Boyd-Graber, Jordan
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F si-etal-2022-examining
%X For users to trust model predictions, they need to understand model outputs, particularly their confidence — calibration aims to adjust (calibrate) models’ confidence to match expected accuracy. We argue that the traditional calibration evaluation does not promote effective calibrations: for example, it can encourage always assigning a mediocre confidence score to all predictions, which does not help users distinguish correct predictions from wrong ones. Building on those observations, we propose a new calibration metric, MacroCE, that better captures whether the model assigns low confidence to wrong predictions and high confidence to correct predictions. Focusing on the practical application of open-domain question answering, we examine conventional calibration methods applied on the widely-used retriever-reader pipeline, all of which do not bring significant gains under our new MacroCE metric. Toward better calibration, we propose a new calibration method (ConsCal) that uses not just final model predictions but whether multiple model checkpoints make consistent predictions. Altogether, we provide an alternative view of calibration along with a new metric, re-evaluation of existing calibration methods on our metric, and proposal of a more effective calibration method.
%R 10.18653/v1/2022.findings-emnlp.204
%U https://aclanthology.org/2022.findings-emnlp.204/
%U https://doi.org/10.18653/v1/2022.findings-emnlp.204
%P 2814-2829
Markdown (Informal)
[Re-Examining Calibration: The Case of Question Answering](https://aclanthology.org/2022.findings-emnlp.204/) (Si et al., Findings 2022)
ACL
- Chenglei Si, Chen Zhao, Sewon Min, and Jordan Boyd-Graber. 2022. Re-Examining Calibration: The Case of Question Answering. In Findings of the Association for Computational Linguistics: EMNLP 2022, pages 2814–2829, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.
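
As a rough illustration of the two ideas named in the abstract, the Python sketch below shows (a) a macro-averaged calibration error in the spirit of MacroCE, which averages the confidence error over correct and wrong predictions separately so that a uniformly "mediocre" confidence no longer looks well calibrated, and (b) a consistency-based confidence score in the spirit of ConsCal, which scores each example by how often model checkpoints agree with the final prediction. The function names, signatures, and exact formulas here are illustrative assumptions, not the paper's formal definitions.

```python
# Illustrative sketch only: approximations of the ideas described in the
# abstract (MacroCE-style macro-averaged calibration error, ConsCal-style
# consistency confidence). Exact definitions are in the paper.
from typing import List, Sequence


def macro_calibration_error(confidences: Sequence[float],
                            correct: Sequence[bool]) -> float:
    """Average the confidence error over correct and wrong predictions
    separately, then take the mean of the two, so a flat 'mediocre'
    confidence cannot score well."""
    pos_errs = [1.0 - c for c, ok in zip(confidences, correct) if ok]  # correct: want high confidence
    neg_errs = [c for c, ok in zip(confidences, correct) if not ok]    # wrong: want low confidence
    pos = sum(pos_errs) / len(pos_errs) if pos_errs else 0.0
    neg = sum(neg_errs) / len(neg_errs) if neg_errs else 0.0
    return 0.5 * (pos + neg)


def consistency_confidence(checkpoint_predictions: List[List[str]]) -> List[float]:
    """Confidence for each example = fraction of checkpoints whose prediction
    agrees with the final checkpoint's prediction (last inner list)."""
    final = checkpoint_predictions[-1]
    n_ckpts = len(checkpoint_predictions)
    return [
        sum(1 for preds in checkpoint_predictions if preds[i] == answer) / n_ckpts
        for i, answer in enumerate(final)
    ]


if __name__ == "__main__":
    # A flat 0.5 confidence at 50% accuracy looks perfectly calibrated under a
    # bucket-based expected calibration error, but scores 0.5 (bad) here.
    print(macro_calibration_error([0.5, 0.5, 0.5, 0.5], [True, True, False, False]))
```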