BibTeX
@article{luz-de-araujo-roth-2023-cross,
title = "Cross-functional Analysis of Generalization in Behavioral Learning",
author = "Luz de Araujo, Pedro Henrique and
Roth, Benjamin",
journal = "Transactions of the Association for Computational Linguistics",
volume = "11",
year = "2023",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2023.tacl-1.60/",
doi = "10.1162/tacl_a_00590",
pages = "1066--1081",
abstract = "In behavioral testing, system functionalities underrepresented in the standard evaluation setting (with a held-out test set) are validated through controlled input-output pairs. Optimizing performance on the behavioral tests during training (behavioral learning) would improve coverage of phenomena not sufficiently represented in the i.i.d. data and could lead to seemingly more robust models. However, there is the risk that the model narrowly captures spurious correlations from the behavioral test suite, leading to overestimation and misrepresentation of model performance{---}one of the original pitfalls of traditional evaluation. In this work, we introduce BeLUGA, an analysis method for evaluating behavioral learning considering generalization across dimensions of different granularity levels. We optimize behavior-specific loss functions and evaluate models on several partitions of the behavioral test suite controlled to leave out specific phenomena. An aggregate score measures generalization to unseen functionalities (or overfitting). We use BeLUGA to examine three representative NLP tasks (sentiment analysis, paraphrase identification, and reading comprehension) and compare the impact of a diverse set of regularization and domain generalization methods on generalization performance."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="luz-de-araujo-roth-2023-cross">
<titleInfo>
<title>Cross-functional Analysis of Generalization in Behavioral Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pedro</namePart>
<namePart type="given">Henrique</namePart>
<namePart type="family">Luz de Araujo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="family">Roth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>In behavioral testing, system functionalities underrepresented in the standard evaluation setting (with a held-out test set) are validated through controlled input-output pairs. Optimizing performance on the behavioral tests during training (behavioral learning) would improve coverage of phenomena not sufficiently represented in the i.i.d. data and could lead to seemingly more robust models. However, there is the risk that the model narrowly captures spurious correlations from the behavioral test suite, leading to overestimation and misrepresentation of model performance—one of the original pitfalls of traditional evaluation. In this work, we introduce BeLUGA, an analysis method for evaluating behavioral learning considering generalization across dimensions of different granularity levels. We optimize behavior-specific loss functions and evaluate models on several partitions of the behavioral test suite controlled to leave out specific phenomena. An aggregate score measures generalization to unseen functionalities (or overfitting). We use BeLUGA to examine three representative NLP tasks (sentiment analysis, paraphrase identification, and reading comprehension) and compare the impact of a diverse set of regularization and domain generalization methods on generalization performance.</abstract>
<identifier type="citekey">luz-de-araujo-roth-2023-cross</identifier>
<identifier type="doi">10.1162/tacl_a_00590</identifier>
<location>
<url>https://aclanthology.org/2023.tacl-1.60/</url>
</location>
<part>
<date>2023</date>
<detail type="volume"><number>11</number></detail>
<extent unit="page">
<start>1066</start>
<end>1081</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Journal Article
%T Cross-functional Analysis of Generalization in Behavioral Learning
%A Luz de Araujo, Pedro Henrique
%A Roth, Benjamin
%J Transactions of the Association for Computational Linguistics
%D 2023
%V 11
%I MIT Press
%C Cambridge, MA
%F luz-de-araujo-roth-2023-cross
%X In behavioral testing, system functionalities underrepresented in the standard evaluation setting (with a held-out test set) are validated through controlled input-output pairs. Optimizing performance on the behavioral tests during training (behavioral learning) would improve coverage of phenomena not sufficiently represented in the i.i.d. data and could lead to seemingly more robust models. However, there is the risk that the model narrowly captures spurious correlations from the behavioral test suite, leading to overestimation and misrepresentation of model performance—one of the original pitfalls of traditional evaluation. In this work, we introduce BeLUGA, an analysis method for evaluating behavioral learning considering generalization across dimensions of different granularity levels. We optimize behavior-specific loss functions and evaluate models on several partitions of the behavioral test suite controlled to leave out specific phenomena. An aggregate score measures generalization to unseen functionalities (or overfitting). We use BeLUGA to examine three representative NLP tasks (sentiment analysis, paraphrase identification, and reading comprehension) and compare the impact of a diverse set of regularization and domain generalization methods on generalization performance.
%R 10.1162/tacl_a_00590
%U https://aclanthology.org/2023.tacl-1.60/
%U https://doi.org/10.1162/tacl_a_00590
%P 1066-1081
Markdown (Informal)
[Cross-functional Analysis of Generalization in Behavioral Learning](https://aclanthology.org/2023.tacl-1.60/) (Luz de Araujo & Roth, TACL 2023)
ACL
Pedro Henrique Luz de Araujo and Benjamin Roth. 2023. Cross-functional Analysis of Generalization in Behavioral Learning. Transactions of the Association for Computational Linguistics, 11:1066–1081.