@inproceedings{ro-etal-2022-scaling,
title = "Scaling Language Model Size in Cross-Device Federated Learning",
author = "Ro, Jae and
Breiner, Theresa and
McConnaughey, Lara and
Chen, Mingqing and
Suresh, Ananda and
Kumar, Shankar and
Mathews, Rajiv",
editor = "Lin, Bill Yuchen and
He, Chaoyang and
Xie, Chulin and
Mireshghallah, Fatemehsadat and
Mehrabi, Ninareh and
Li, Tian and
Soltanolkotabi, Mahdi and
Ren, Xiang",
booktitle = "Proceedings of the First Workshop on Federated Learning for Natural Language Processing (FL4NLP 2022)",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.fl4nlp-1.2/",
doi = "10.18653/v1/2022.fl4nlp-1.2",
pages = "6--20",
abstract = "Most studies in cross-device federated learning focus on small models, due to the server-client communication and on-device computation bottlenecks. In this work, we leverage various techniques for mitigating these bottlenecks to train larger language models in cross-device federated learning. With systematic applications of partial model training, quantization, efficient transfer learning, and communication-efficient optimizers, we are able to train a 21M parameter Transformer that achieves the same perplexity as that of a similarly sized LSTM with $\sim10\times$ smaller client-to-server communication cost and 11{\%} lower perplexity than smaller LSTMs commonly studied in literature."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ro-etal-2022-scaling">
<titleInfo>
<title>Scaling Language Model Size in Cross-Device Federated Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jae</namePart>
<namePart type="family">Ro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Theresa</namePart>
<namePart type="family">Breiner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lara</namePart>
<namePart type="family">McConnaughey</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingqing</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ananda</namePart>
<namePart type="family">Suresh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shankar</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rajiv</namePart>
<namePart type="family">Mathews</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Federated Learning for Natural Language Processing (FL4NLP 2022)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bill</namePart>
<namePart type="given">Yuchen</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chaoyang</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chulin</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fatemehsadat</namePart>
<namePart type="family">Mireshghallah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ninareh</namePart>
<namePart type="family">Mehrabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tian</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mahdi</namePart>
<namePart type="family">Soltanolkotabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiang</namePart>
<namePart type="family">Ren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Most studies in cross-device federated learning focus on small models, due to the server-client communication and on-device computation bottlenecks. In this work, we leverage various techniques for mitigating these bottlenecks to train larger language models in cross-device federated learning. With systematic applications of partial model training, quantization, efficient transfer learning, and communication-efficient optimizers, we are able to train a 21M parameter Transformer that achieves the same perplexity as that of a similarly sized LSTM with ∼10× smaller client-to-server communication cost and 11% lower perplexity than smaller LSTMs commonly studied in literature.</abstract>
<identifier type="citekey">ro-etal-2022-scaling</identifier>
<identifier type="doi">10.18653/v1/2022.fl4nlp-1.2</identifier>
<location>
<url>https://aclanthology.org/2022.fl4nlp-1.2/</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>6</start>
<end>20</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Scaling Language Model Size in Cross-Device Federated Learning
%A Ro, Jae
%A Breiner, Theresa
%A McConnaughey, Lara
%A Chen, Mingqing
%A Suresh, Ananda
%A Kumar, Shankar
%A Mathews, Rajiv
%Y Lin, Bill Yuchen
%Y He, Chaoyang
%Y Xie, Chulin
%Y Mireshghallah, Fatemehsadat
%Y Mehrabi, Ninareh
%Y Li, Tian
%Y Soltanolkotabi, Mahdi
%Y Ren, Xiang
%S Proceedings of the First Workshop on Federated Learning for Natural Language Processing (FL4NLP 2022)
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F ro-etal-2022-scaling
%X Most studies in cross-device federated learning focus on small models, due to the server-client communication and on-device computation bottlenecks. In this work, we leverage various techniques for mitigating these bottlenecks to train larger language models in cross-device federated learning. With systematic applications of partial model training, quantization, efficient transfer learning, and communication-efficient optimizers, we are able to train a 21M parameter Transformer that achieves the same perplexity as that of a similarly sized LSTM with ∼10× smaller client-to-server communication cost and 11% lower perplexity than smaller LSTMs commonly studied in literature.
%R 10.18653/v1/2022.fl4nlp-1.2
%U https://aclanthology.org/2022.fl4nlp-1.2/
%U https://doi.org/10.18653/v1/2022.fl4nlp-1.2
%P 6-20
Markdown (Informal)
[Scaling Language Model Size in Cross-Device Federated Learning](https://aclanthology.org/2022.fl4nlp-1.2/) (Ro et al., FL4NLP 2022)
ACL
Jae Ro, Theresa Breiner, Lara McConnaughey, Mingqing Chen, Ananda Suresh, Shankar Kumar, and Rajiv Mathews. 2022. Scaling Language Model Size in Cross-Device Federated Learning. In Proceedings of the First Workshop on Federated Learning for Natural Language Processing (FL4NLP 2022), pages 6–20, Dublin, Ireland. Association for Computational Linguistics.