BibTeX
@inproceedings{haviv-etal-2022-transformer,
title = "Transformer Language Models without Positional Encodings Still Learn Positional Information",
author = "Haviv, Adi and
Ram, Ori and
Press, Ofir and
Izsak, Peter and
Levy, Omer",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-emnlp.99",
doi = "10.18653/v1/2022.findings-emnlp.99",
pages = "1382--1390",
abstract = "Causal transformer language models (LMs), such as GPT-3, typically require some form of positional encoding, such as positional embeddings. However, we show that LMs without any explicit positional encoding are still competitive with standard models and that this phenomenon is robust across different datasets, model sizes, and sequence lengths.Probing experiments reveal that such models acquire an implicit notion of absolute positions throughout the network, effectively compensating for the missing information.We conjecture that causal attention enables the model to infer the number of predecessors that each token can attend to, thereby approximating its absolute position.Our findings indicate that causal LMs might derive positional awareness not only from the explicit positioning mechanism but also from the effects of the causal mask.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="haviv-etal-2022-transformer">
<titleInfo>
<title>Transformer Language Models without Positional Encodings Still Learn Positional Information</title>
</titleInfo>
<name type="personal">
<namePart type="given">Adi</namePart>
<namePart type="family">Haviv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ori</namePart>
<namePart type="family">Ram</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ofir</namePart>
<namePart type="family">Press</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Izsak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Omer</namePart>
<namePart type="family">Levy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Causal transformer language models (LMs), such as GPT-3, typically require some form of positional encoding, such as positional embeddings. However, we show that LMs without any explicit positional encoding are still competitive with standard models and that this phenomenon is robust across different datasets, model sizes, and sequence lengths. Probing experiments reveal that such models acquire an implicit notion of absolute positions throughout the network, effectively compensating for the missing information. We conjecture that causal attention enables the model to infer the number of predecessors that each token can attend to, thereby approximating its absolute position. Our findings indicate that causal LMs might derive positional awareness not only from the explicit positioning mechanism but also from the effects of the causal mask.</abstract>
<identifier type="citekey">haviv-etal-2022-transformer</identifier>
<identifier type="doi">10.18653/v1/2022.findings-emnlp.99</identifier>
<location>
<url>https://aclanthology.org/2022.findings-emnlp.99</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>1382</start>
<end>1390</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Transformer Language Models without Positional Encodings Still Learn Positional Information
%A Haviv, Adi
%A Ram, Ori
%A Press, Ofir
%A Izsak, Peter
%A Levy, Omer
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F haviv-etal-2022-transformer
%X Causal transformer language models (LMs), such as GPT-3, typically require some form of positional encoding, such as positional embeddings. However, we show that LMs without any explicit positional encoding are still competitive with standard models and that this phenomenon is robust across different datasets, model sizes, and sequence lengths. Probing experiments reveal that such models acquire an implicit notion of absolute positions throughout the network, effectively compensating for the missing information. We conjecture that causal attention enables the model to infer the number of predecessors that each token can attend to, thereby approximating its absolute position. Our findings indicate that causal LMs might derive positional awareness not only from the explicit positioning mechanism but also from the effects of the causal mask.
%R 10.18653/v1/2022.findings-emnlp.99
%U https://aclanthology.org/2022.findings-emnlp.99
%U https://doi.org/10.18653/v1/2022.findings-emnlp.99
%P 1382-1390
Markdown (Informal)
[Transformer Language Models without Positional Encodings Still Learn Positional Information](https://aclanthology.org/2022.findings-emnlp.99) (Haviv et al., Findings 2022)
ACL
Adi Haviv, Ori Ram, Ofir Press, Peter Izsak, and Omer Levy. 2022. Transformer Language Models without Positional Encodings Still Learn Positional Information. In Findings of the Association for Computational Linguistics: EMNLP 2022, pages 1382–1390, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.
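The abstract's central conjecture, that the causal mask alone lets each position infer how many predecessors it can attend to, can be made concrete with a small numerical sketch. The code below is not from the paper; it assumes uniform attention weights and a distinctive BOS-like first token, both hypothetical simplifications, to show how a causal attention layer with no positional embeddings still produces position-dependent outputs.

```python
# Illustrative sketch only (not the paper's code). With a causal mask and
# uniform attention weights, position t averages over exactly t+1 visible
# tokens, so any fixed component in the values is scaled by 1/(t+1):
# an absolute-position signal that needs no positional embeddings.
import numpy as np

rng = np.random.default_rng(0)
seq_len, d = 16, 8

# Token values: a distinctive BOS-like first vector, small zero-mean noise
# elsewhere. No positional information is injected anywhere.
values = rng.normal(0.0, 0.1, (seq_len, d))
values[0] = np.ones(d)

# Causal self-attention with all-zero logits, i.e. uniform weights over the
# visible prefix after the softmax. Row t attends to positions 0..t only.
logits = np.zeros((seq_len, seq_len))
logits[np.triu(np.ones((seq_len, seq_len), dtype=bool), k=1)] = -np.inf
weights = np.exp(logits)
weights /= weights.sum(axis=-1, keepdims=True)
out = weights @ values  # out[t] = mean of values[0..t]

# Project each output onto the BOS direction: the coefficient decays like
# 1/(t+1), a monotone function of absolute position that later layers
# (or a linear probe) could read out.
bos = values[0]
coeff = out @ bos / (bos @ bos)
for t in range(0, seq_len, 5):
    print(f"position {t:2d}: BOS coefficient {coeff[t]:.3f}, 1/(t+1) = {1/(t+1):.3f}")
```

In a real model the attention weights are not exactly uniform, but the same counting effect (more predecessors means a smaller per-token share of attention) is what the paper conjectures the network exploits, and its probing experiments then recover absolute positions from the hidden states.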