BibTeX
@article{jiang-etal-2024-addressing,
title = "Addressing the Binning Problem in Calibration Assessment through Scalar Annotations",
author = "Jiang, Zhengping and
Liu, Anqi and
Van Durme, Benjamin",
journal = "Transactions of the Association for Computational Linguistics",
volume = "12",
year = "2024",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2024.tacl-1.7",
doi = "10.1162/tacl_a_00636",
pages = "120--136",
abstract = "Computational linguistics models commonly target the prediction of discrete{---}categorical{---}labels. When assessing how well-calibrated these model predictions are, popular evaluation schemes require practitioners to manually determine a binning scheme: grouping labels into bins to approximate true label posterior. The problem is that these metrics are sensitive to binning decisions. We consider two solutions to the binning problem that apply at the stage of data annotation: collecting either distributed (redundant) labels or direct scalar value assignment. In this paper, we show that although both approaches address the binning problem by evaluating instance-level calibration, direct scalar assignment is significantly more cost-effective. We provide theoretical analysis and empirical evidence to support our proposal for dataset creators to adopt scalar annotation protocols to enable a higher-quality assessment of model calibration.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="jiang-etal-2024-addressing">
    <titleInfo>
      <title>Addressing the Binning Problem in Calibration Assessment through Scalar Annotations</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Zhengping</namePart>
      <namePart type="family">Jiang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anqi</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Benjamin</namePart>
      <namePart type="family">Van Durme</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <genre authority="bibutilsgt">journal article</genre>
    <relatedItem type="host">
      <titleInfo>
        <title>Transactions of the Association for Computational Linguistics</title>
      </titleInfo>
      <originInfo>
        <issuance>continuing</issuance>
        <publisher>MIT Press</publisher>
        <place>
          <placeTerm type="text">Cambridge, MA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">periodical</genre>
      <genre authority="bibutilsgt">academic journal</genre>
    </relatedItem>
    <abstract>Computational linguistics models commonly target the prediction of discrete—categorical—labels. When assessing how well-calibrated these model predictions are, popular evaluation schemes require practitioners to manually determine a binning scheme: grouping labels into bins to approximate true label posterior. The problem is that these metrics are sensitive to binning decisions. We consider two solutions to the binning problem that apply at the stage of data annotation: collecting either distributed (redundant) labels or direct scalar value assignment. In this paper, we show that although both approaches address the binning problem by evaluating instance-level calibration, direct scalar assignment is significantly more cost-effective. We provide theoretical analysis and empirical evidence to support our proposal for dataset creators to adopt scalar annotation protocols to enable a higher-quality assessment of model calibration.</abstract>
    <identifier type="citekey">jiang-etal-2024-addressing</identifier>
    <identifier type="doi">10.1162/tacl_a_00636</identifier>
    <location>
      <url>https://aclanthology.org/2024.tacl-1.7</url>
    </location>
    <part>
      <date>2024</date>
      <detail type="volume"><number>12</number></detail>
      <extent unit="page">
        <start>120</start>
        <end>136</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Journal Article
%T Addressing the Binning Problem in Calibration Assessment through Scalar Annotations
%A Jiang, Zhengping
%A Liu, Anqi
%A Van Durme, Benjamin
%J Transactions of the Association for Computational Linguistics
%D 2024
%V 12
%I MIT Press
%C Cambridge, MA
%F jiang-etal-2024-addressing
%X Computational linguistics models commonly target the prediction of discrete—categorical—labels. When assessing how well-calibrated these model predictions are, popular evaluation schemes require practitioners to manually determine a binning scheme: grouping labels into bins to approximate true label posterior. The problem is that these metrics are sensitive to binning decisions. We consider two solutions to the binning problem that apply at the stage of data annotation: collecting either distributed (redundant) labels or direct scalar value assignment. In this paper, we show that although both approaches address the binning problem by evaluating instance-level calibration, direct scalar assignment is significantly more cost-effective. We provide theoretical analysis and empirical evidence to support our proposal for dataset creators to adopt scalar annotation protocols to enable a higher-quality assessment of model calibration.
%R 10.1162/tacl_a_00636
%U https://aclanthology.org/2024.tacl-1.7
%U https://doi.org/10.1162/tacl_a_00636
%P 120-136
Markdown (Informal)
[Addressing the Binning Problem in Calibration Assessment through Scalar Annotations](https://aclanthology.org/2024.tacl-1.7) (Jiang et al., TACL 2024)
ACL
Zhengping Jiang, Anqi Liu, and Benjamin Van Durme. 2024. Addressing the Binning Problem in Calibration Assessment through Scalar Annotations. Transactions of the Association for Computational Linguistics, 12:120–136.
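
The binning sensitivity described in the abstract can be seen with a minimal sketch of expected calibration error (ECE), the canonical binned calibration metric the paper's critique applies to. This is not the paper's code: the equal-width binning, the toy confidence scores, and the bin counts below are all assumptions chosen purely for illustration.

```python
# Minimal sketch: equal-width-binned ECE, showing that the same set of
# predictions yields different calibration scores under different bin counts.
import numpy as np

def ece(confidences, correct, n_bins):
    """ECE: the bin-size-weighted mean |accuracy - confidence| over equal-width bins."""
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    total = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        in_bin = (confidences > lo) & (confidences <= hi)
        if in_bin.any():
            gap = abs(correct[in_bin].mean() - confidences[in_bin].mean())
            total += in_bin.mean() * gap  # in_bin.mean() is the bin's sample fraction
    return total

rng = np.random.default_rng(0)
conf = rng.uniform(0.5, 1.0, size=200)                   # toy confidence scores
correct = (rng.uniform(size=200) < conf).astype(float)   # simulated correctness

for n_bins in (5, 10, 15, 20):
    print(f"{n_bins:2d} bins: ECE = {ece(conf, correct, n_bins):.4f}")
```

Running this prints a different ECE for each bin count even though the predictions never change, which is exactly the evaluation instability that motivates the paper's scalar-annotation proposal.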