@inproceedings{fornaciari-etal-2022-hard,
title = "Hard and Soft Evaluation of {NLP} models with {BOO}t{ST}rap {SA}mpling - {B}oo{S}t{S}a",
author = "Fornaciari, Tommaso and
Uma, Alexandra and
Poesio, Massimo and
Hovy, Dirk",
editor = "Basile, Valerio and
Kozareva, Zornitsa and
Stajner, Sanja",
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: System Demonstrations",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.acl-demo.12/",
doi = "10.18653/v1/2022.acl-demo.12",
pages = "127--134",
abstract = "Natural Language Processing (NLP) {\textquoteleft}s applied nature makes it necessary to select the most effective and robust models. Producing slightly higher performance is insufficient; we want to know whether this advantage will carry over to other data sets. Bootstrapped significance tests can indicate that ability. So while necessary, computing the significance of models' performance differences has many levels of complexity. It can be tedious, especially when the experimental design has many conditions to compare and several runs of experiments. We present BooStSa, a tool that makes it easy to compute significance levels with the BOOtSTrap SAmpling procedure to evaluate models that predict not only standard hard labels but soft-labels (i.e., probability distributions over different classes) as well."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fornaciari-etal-2022-hard">
<titleInfo>
<title>Hard and Soft Evaluation of NLP models with BOOtSTrap SAmpling - BooStSa</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tommaso</namePart>
<namePart type="family">Fornaciari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandra</namePart>
<namePart type="family">Uma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Massimo</namePart>
<namePart type="family">Poesio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dirk</namePart>
<namePart type="family">Hovy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Valerio</namePart>
<namePart type="family">Basile</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sanja</namePart>
<namePart type="family">Stajner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Natural Language Processing (NLP)’s applied nature makes it necessary to select the most effective and robust models. Producing slightly higher performance is insufficient; we want to know whether this advantage will carry over to other data sets. Bootstrapped significance tests can indicate that ability. So while necessary, computing the significance of models’ performance differences has many levels of complexity. It can be tedious, especially when the experimental design has many conditions to compare and several runs of experiments. We present BooStSa, a tool that makes it easy to compute significance levels with the BOOtSTrap SAmpling procedure to evaluate models that predict not only standard hard labels but soft-labels (i.e., probability distributions over different classes) as well.</abstract>
<identifier type="citekey">fornaciari-etal-2022-hard</identifier>
<identifier type="doi">10.18653/v1/2022.acl-demo.12</identifier>
<location>
<url>https://aclanthology.org/2022.acl-demo.12/</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>127</start>
<end>134</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Hard and Soft Evaluation of NLP models with BOOtSTrap SAmpling - BooStSa
%A Fornaciari, Tommaso
%A Uma, Alexandra
%A Poesio, Massimo
%A Hovy, Dirk
%Y Basile, Valerio
%Y Kozareva, Zornitsa
%Y Stajner, Sanja
%S Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: System Demonstrations
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F fornaciari-etal-2022-hard
%X Natural Language Processing (NLP)’s applied nature makes it necessary to select the most effective and robust models. Producing slightly higher performance is insufficient; we want to know whether this advantage will carry over to other data sets. Bootstrapped significance tests can indicate that ability. So while necessary, computing the significance of models’ performance differences has many levels of complexity. It can be tedious, especially when the experimental design has many conditions to compare and several runs of experiments. We present BooStSa, a tool that makes it easy to compute significance levels with the BOOtSTrap SAmpling procedure to evaluate models that predict not only standard hard labels but soft-labels (i.e., probability distributions over different classes) as well.
%R 10.18653/v1/2022.acl-demo.12
%U https://aclanthology.org/2022.acl-demo.12/
%U https://doi.org/10.18653/v1/2022.acl-demo.12
%P 127-134
Markdown (Informal)
[Hard and Soft Evaluation of NLP models with BOOtSTrap SAmpling - BooStSa](https://aclanthology.org/2022.acl-demo.12/) (Fornaciari et al., ACL 2022)
ACL
Tommaso Fornaciari, Alexandra Uma, Massimo Poesio, and Dirk Hovy. 2022. [Hard and Soft Evaluation of NLP models with BOOtSTrap SAmpling - BooStSa](https://aclanthology.org/2022.acl-demo.12/). In *Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: System Demonstrations*, pages 127–134, Dublin, Ireland. Association for Computational Linguistics.
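
The abstract describes comparing models with a paired bootstrap significance test over both hard labels and soft labels (probability distributions over classes). The following is a minimal sketch of that general procedure, not the BooStSa API; it assumes NumPy, and all function and variable names are illustrative placeholders.

```python
# Minimal sketch of a paired bootstrap significance test (illustrative, not BooStSa's API).
import numpy as np

def paired_bootstrap(metric, y_true, preds_a, preds_b, n_resamples=10_000, seed=0):
    """Return (observed metric difference A-B, bootstrap p-value that the advantage vanishes)."""
    rng = np.random.default_rng(seed)
    y_true, preds_a, preds_b = map(np.asarray, (y_true, preds_a, preds_b))
    n = len(y_true)
    observed = metric(y_true, preds_a) - metric(y_true, preds_b)
    losses = 0
    for _ in range(n_resamples):
        idx = rng.integers(0, n, size=n)  # resample test items with replacement
        delta = metric(y_true[idx], preds_a[idx]) - metric(y_true[idx], preds_b[idx])
        if delta <= 0:                    # A fails to beat B on this resample
            losses += 1
    return observed, losses / n_resamples

# "Hard" evaluation: accuracy over discrete labels.
def accuracy(y, p):
    return float(np.mean(y == p))

# "Soft" evaluation: mean log-likelihood of gold label distributions under the
# predicted distributions (higher is better), one possible soft-label metric.
def soft_log_likelihood(y_dist, p_dist):
    return float(np.mean(np.sum(y_dist * np.log(np.clip(p_dist, 1e-12, 1.0)), axis=1)))

# Hypothetical usage with hard labels for two models A and B:
y = np.array([0, 1, 1, 0, 2])
a = np.array([0, 1, 1, 0, 1])
b = np.array([0, 0, 1, 0, 1])
diff, p_value = paired_bootstrap(accuracy, y, a, b)
```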