@inproceedings{svete-etal-2024-transformers,
title = "Can Transformers Learn $n$-gram Language Models?",
author = "Svete, Anej and
Borenstein, Nadav and
Zhou, Mike and
Augenstein, Isabelle and
Cotterell, Ryan",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.550/",
doi = "10.18653/v1/2024.emnlp-main.550",
pages = "9851--9867",
abstract = "Much theoretical work has described the ability of transformers to represent formal languages. However, linking theoretical results to empirical performance is not straightforward due to the complex interplay between the architecture, the learning algorithm, and training data. To test whether theoretical lower bounds imply \textit{learnability} of formal languages, we turn to recent work relating transformers to $n$-gram language models (LMs). We study transformers' ability to learn random $n$-gram LMs of two kinds: ones with arbitrary next-symbol probabilities and ones where those are defined with shared parameters. We find that classic estimation techniques for $n$-gram LMs such as add-$\lambda$ smoothing outperform transformers on the former, while transformers perform better on the latter, outperforming methods specifically designed to learn $n$-gram LMs."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="svete-etal-2024-transformers">
<titleInfo>
<title>Can Transformers Learn n-gram Language Models?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anej</namePart>
<namePart type="family">Svete</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadav</namePart>
<namePart type="family">Borenstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isabelle</namePart>
<namePart type="family">Augenstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Cotterell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Much theoretical work has described the ability of transformers to represent formal languages. However, linking theoretical results to empirical performance is not straightforward due to the complex interplay between the architecture, the learning algorithm, and training data. To test whether theoretical lower bounds imply learnability of formal languages, we turn to recent work relating transformers to n-gram language models (LMs). We study transformers’ ability to learn random n-gram LMs of two kinds: ones with arbitrary next-symbol probabilities and ones where those are defined with shared parameters. We find that classic estimation techniques for n-gram LMs such as add-λ smoothing outperform transformers on the former, while transformers perform better on the latter, outperforming methods specifically designed to learn n-gram LMs.</abstract>
<identifier type="citekey">svete-etal-2024-transformers</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.550</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.550/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>9851</start>
<end>9867</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Can Transformers Learn n-gram Language Models?
%A Svete, Anej
%A Borenstein, Nadav
%A Zhou, Mike
%A Augenstein, Isabelle
%A Cotterell, Ryan
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F svete-etal-2024-transformers
%X Much theoretical work has described the ability of transformers to represent formal languages. However, linking theoretical results to empirical performance is not straightforward due to the complex interplay between the architecture, the learning algorithm, and training data. To test whether theoretical lower bounds imply learnability of formal languages, we turn to recent work relating transformers to n-gram language models (LMs). We study transformers’ ability to learn random n-gram LMs of two kinds: ones with arbitrary next-symbol probabilities and ones where those are defined with shared parameters. We find that classic estimation techniques for n-gram LMs such as add-λ smoothing outperform transformers on the former, while transformers perform better on the latter, outperforming methods specifically designed to learn n-gram LMs.
%R 10.18653/v1/2024.emnlp-main.550
%U https://aclanthology.org/2024.emnlp-main.550/
%U https://doi.org/10.18653/v1/2024.emnlp-main.550
%P 9851-9867
Markdown (Informal)
[Can Transformers Learn n-gram Language Models?](https://aclanthology.org/2024.emnlp-main.550/) (Svete et al., EMNLP 2024)
ACL
- Anej Svete, Nadav Borenstein, Mike Zhou, Isabelle Augenstein, and Ryan Cotterell. 2024. Can Transformers Learn n-gram Language Models? In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 9851–9867, Miami, Florida, USA. Association for Computational Linguistics.
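
The abstract contrasts transformers with add-λ smoothing, the classic additive-smoothing estimator for n-gram LMs. As background only, here is a minimal Python sketch of add-λ estimation of next-symbol probabilities; the function and variable names are illustrative and not taken from the paper's code or experiments:

```python
from collections import defaultdict

def addlambda_ngram_lm(corpus, n=2, lam=0.5):
    """Estimate next-symbol probabilities with add-lambda smoothing:
    p(w | h) = (count(h, w) + lam) / (count(h) + lam * |V|).
    Illustrative sketch, not the paper's implementation."""
    vocab = sorted({w for sent in corpus for w in sent})
    context_counts = defaultdict(int)   # count(h)
    ngram_counts = defaultdict(int)     # count(h, w)
    for sent in corpus:
        padded = ["<bos>"] * (n - 1) + list(sent) + ["<eos>"]
        for i in range(n - 1, len(padded)):
            h, w = tuple(padded[i - n + 1:i]), padded[i]
            context_counts[h] += 1
            ngram_counts[(h, w)] += 1
    V = len(vocab) + 1  # symbols plus <eos>
    def prob(history, symbol):
        h = tuple(history)
        return (ngram_counts[(h, symbol)] + lam) / (context_counts[h] + lam * V)
    return prob

# Toy usage: smoothed bigram estimate of p(b | a) from a tiny corpus.
p = addlambda_ngram_lm([["a", "b", "a"], ["a", "b"], ["b", "a"]], n=2, lam=0.5)
print(p(["a"], "b"))
```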