@inproceedings{liu-etal-2020-precise,
title = "Precise Task Formalization Matters in {W}inograd Schema Evaluations",
author = "Liu, Haokun and
Huang, William and
Mungra, Dhara and
Bowman, Samuel R.",
editor = "Webber, Bonnie and
Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.emnlp-main.664",
doi = "10.18653/v1/2020.emnlp-main.664",
pages = "8275--8280",
abstract = "Performance on the Winograd Schema Challenge (WSC), a respected English commonsense reasoning benchmark, recently rocketed from chance accuracy to 89{\%} on the SuperGLUE leaderboard, with relatively little corroborating evidence of a correspondingly large improvement in reasoning ability. We hypothesize that much of this improvement comes from recent changes in task formalization{---}the combination of input specification, loss function, and reuse of pretrained parameters{---}by users of the dataset, rather than improvements in the pretrained model{'}s reasoning ability. We perform an ablation on two Winograd Schema datasets that interpolates between the formalizations used before and after this surge, and find (i) framing the task as multiple choice improves performance dramatically and (ii)several additional techniques, including the reuse of a pretrained language modeling head, can mitigate the model{'}s extreme sensitivity to hyperparameters. We urge future benchmark creators to impose additional structure to minimize the impact of formalization decisions on reported results.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2020-precise">
<titleInfo>
<title>Precise Task Formalization Matters in Winograd Schema Evaluations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Haokun</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">William</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dhara</namePart>
<namePart type="family">Mungra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Bowman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bonnie</namePart>
<namePart type="family">Webber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Performance on the Winograd Schema Challenge (WSC), a respected English commonsense reasoning benchmark, recently rocketed from chance accuracy to 89% on the SuperGLUE leaderboard, with relatively little corroborating evidence of a correspondingly large improvement in reasoning ability. We hypothesize that much of this improvement comes from recent changes in task formalization—the combination of input specification, loss function, and reuse of pretrained parameters—by users of the dataset, rather than improvements in the pretrained model’s reasoning ability. We perform an ablation on two Winograd Schema datasets that interpolates between the formalizations used before and after this surge, and find (i) framing the task as multiple choice improves performance dramatically and (ii)several additional techniques, including the reuse of a pretrained language modeling head, can mitigate the model’s extreme sensitivity to hyperparameters. We urge future benchmark creators to impose additional structure to minimize the impact of formalization decisions on reported results.</abstract>
<identifier type="citekey">liu-etal-2020-precise</identifier>
<identifier type="doi">10.18653/v1/2020.emnlp-main.664</identifier>
<location>
<url>https://aclanthology.org/2020.emnlp-main.664</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>8275</start>
<end>8280</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Precise Task Formalization Matters in Winograd Schema Evaluations
%A Liu, Haokun
%A Huang, William
%A Mungra, Dhara
%A Bowman, Samuel R.
%Y Webber, Bonnie
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F liu-etal-2020-precise
%X Performance on the Winograd Schema Challenge (WSC), a respected English commonsense reasoning benchmark, recently rocketed from chance accuracy to 89% on the SuperGLUE leaderboard, with relatively little corroborating evidence of a correspondingly large improvement in reasoning ability. We hypothesize that much of this improvement comes from recent changes in task formalization—the combination of input specification, loss function, and reuse of pretrained parameters—by users of the dataset, rather than improvements in the pretrained model’s reasoning ability. We perform an ablation on two Winograd Schema datasets that interpolates between the formalizations used before and after this surge, and find (i) framing the task as multiple choice improves performance dramatically and (ii)several additional techniques, including the reuse of a pretrained language modeling head, can mitigate the model’s extreme sensitivity to hyperparameters. We urge future benchmark creators to impose additional structure to minimize the impact of formalization decisions on reported results.
%R 10.18653/v1/2020.emnlp-main.664
%U https://aclanthology.org/2020.emnlp-main.664
%U https://doi.org/10.18653/v1/2020.emnlp-main.664
%P 8275-8280
Markdown (Informal)
[Precise Task Formalization Matters in Winograd Schema Evaluations](https://aclanthology.org/2020.emnlp-main.664) (Liu et al., EMNLP 2020)
ACL