BibTeX
@inproceedings{sanyal-etal-2022-robustlr,
title = "{R}obust{LR}: A Diagnostic Benchmark for Evaluating Logical Robustness of Deductive Reasoners",
author = "Sanyal, Soumya and
Liao, Zeyi and
Ren, Xiang",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.653/",
doi = "10.18653/v1/2022.emnlp-main.653",
pages = "9614--9631",
abstract = "Transformers have been shown to be able to perform deductive reasoning on inputs containing rules and statements written in the English natural language. However, it is unclear if these models indeed follow rigorous logical reasoning to arrive at the prediction or rely on spurious correlation patterns in making decisions. A strong deductive reasoning model should consistently understand the semantics of different logical operators. To this end, we present RobustLR, a diagnostic benchmark that evaluates the robustness of language models to minimal logical edits in the inputs and different logical equivalence conditions. In our experiments with RoBERTa, T5, and GPT3 we show that the models trained on deductive reasoning datasets do not perform consistently on the RobustLR test set, thus showing that the models are not robust to our proposed logical perturbations. Further, we observe that the models find it especially hard to learn logical negation operators. Our results demonstrate the shortcomings of current language models in logical reasoning and call for the development of better inductive biases to teach the logical semantics to language models. All the datasets and code base have been made publicly available."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="sanyal-etal-2022-robustlr">
    <titleInfo>
      <title>RobustLR: A Diagnostic Benchmark for Evaluating Logical Robustness of Deductive Reasoners</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Soumya</namePart>
      <namePart type="family">Sanyal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zeyi</namePart>
      <namePart type="family">Liao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiang</namePart>
      <namePart type="family">Ren</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yoav</namePart>
        <namePart type="family">Goldberg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Zornitsa</namePart>
        <namePart type="family">Kozareva</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yue</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Transformers have been shown to be able to perform deductive reasoning on inputs containing rules and statements written in natural English. However, it is unclear whether these models indeed follow rigorous logical reasoning to arrive at the prediction or rely on spurious correlation patterns in making decisions. A strong deductive reasoning model should consistently understand the semantics of different logical operators. To this end, we present RobustLR, a diagnostic benchmark that evaluates the robustness of language models to minimal logical edits in the inputs and different logical equivalence conditions. In our experiments with RoBERTa, T5, and GPT3, we show that the models trained on deductive reasoning datasets do not perform consistently on the RobustLR test set, indicating that the models are not robust to our proposed logical perturbations. Further, we observe that the models find it especially hard to learn logical negation operators. Our results demonstrate the shortcomings of current language models in logical reasoning and call for the development of better inductive biases to teach the logical semantics to language models. All the datasets and code base have been made publicly available.</abstract>
<identifier type="citekey">sanyal-etal-2022-robustlr</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-main.653</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-main.653/</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>9614</start>
<end>9631</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T RobustLR: A Diagnostic Benchmark for Evaluating Logical Robustness of Deductive Reasoners
%A Sanyal, Soumya
%A Liao, Zeyi
%A Ren, Xiang
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F sanyal-etal-2022-robustlr
%X Transformers have been shown to be able to perform deductive reasoning on inputs containing rules and statements written in natural English. However, it is unclear whether these models indeed follow rigorous logical reasoning to arrive at the prediction or rely on spurious correlation patterns in making decisions. A strong deductive reasoning model should consistently understand the semantics of different logical operators. To this end, we present RobustLR, a diagnostic benchmark that evaluates the robustness of language models to minimal logical edits in the inputs and different logical equivalence conditions. In our experiments with RoBERTa, T5, and GPT3, we show that the models trained on deductive reasoning datasets do not perform consistently on the RobustLR test set, indicating that the models are not robust to our proposed logical perturbations. Further, we observe that the models find it especially hard to learn logical negation operators. Our results demonstrate the shortcomings of current language models in logical reasoning and call for the development of better inductive biases to teach the logical semantics to language models. All the datasets and code base have been made publicly available.
%R 10.18653/v1/2022.emnlp-main.653
%U https://aclanthology.org/2022.emnlp-main.653/
%U https://doi.org/10.18653/v1/2022.emnlp-main.653
%P 9614-9631
Markdown (Informal)
[RobustLR: A Diagnostic Benchmark for Evaluating Logical Robustness of Deductive Reasoners](https://aclanthology.org/2022.emnlp-main.653/) (Sanyal et al., EMNLP 2022)
ACL
Soumya Sanyal, Zeyi Liao, and Xiang Ren. 2022. [RobustLR: A Diagnostic Benchmark for Evaluating Logical Robustness of Deductive Reasoners](https://aclanthology.org/2022.emnlp-main.653/). In *Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing*, pages 9614–9631, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.