NAACL-HLT 2021 main-conference paper; entry follows the ACL Anthology export
(key scheme, venue naming, address = "Online" for the virtual 2021 conference).
Text outside @...{...} entries is ignored by BibTeX, so this note is safe.
@inproceedings{lin-etal-2021-limitations,
  title     = {Limitations of Autoregressive Models and Their Alternatives},
  author    = {Lin, Chu-Cheng and
               Jaech, Aaron and
               Li, Xin and
               Gormley, Matthew R. and
               Eisner, Jason},
  editor    = {Toutanova, Kristina and
               Rumshisky, Anna and
               Zettlemoyer, Luke and
               Hakkani-Tur, Dilek and
               Beltagy, Iz and
               Bethard, Steven and
               Cotterell, Ryan and
               Chakraborty, Tanmoy and
               Zhou, Yichao},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  month     = jun,
  year      = {2021},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.naacl-main.405},
  doi       = {10.18653/v1/2021.naacl-main.405},
  pages     = {5147--5173},
  abstract  = {Standard autoregressive language models perform only polynomial-time computation to compute the probability of the next symbol. While this is attractive, it means they cannot model distributions whose next-symbol probability is \textit{hard} to compute. Indeed, they cannot even model them well enough to solve associated \textit{easy} decision problems for which an engineer might want to consult a language model. These limitations apply no matter how much computation and data are used to train the model, unless the model is given access to oracle parameters that grow \textit{superpolynomially} in sequence length. Thus, simply training larger autoregressive language models is not a panacea for NLP. Alternatives include energy-based models (which give up efficient sampling) and latent-variable autoregressive models (which give up efficient scoring of a given string). Both are powerful enough to escape the above limitations.},
}