% Workshop paper; ACL Anthology ID 2021.sustainlp-1.16.
@inproceedings{sachidananda-etal-2021-efficient,
  title     = {Efficient Domain Adaptation of Language Models via Adaptive Tokenization},
  author    = {Sachidananda, Vin and
               Kessler, Jason and
               Lai, Yi-An},
  editor    = {Moosavi, Nafise Sadat and
               Gurevych, Iryna and
               Fan, Angela and
               Wolf, Thomas and
               Hou, Yufang and
               Marasovi{\'c}, Ana and
               Ravi, Sujith},
  booktitle = {Proceedings of the Second Workshop on Simple and Efficient Natural Language Processing},
  month     = nov,
  year      = {2021},
  address   = {Virtual},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.sustainlp-1.16/},
  doi       = {10.18653/v1/2021.sustainlp-1.16},
  pages     = {155--165},
  abstract  = {Contextual embedding-based language models trained on large data sets, such as BERT and RoBERTa, provide strong performance across a wide range of tasks and are ubiquitous in modern NLP. It has been observed that fine-tuning these models on tasks involving data from domains different from that on which they were pretrained can lead to suboptimal performance. Recent work has explored approaches to adapt pretrained language models to new domains by incorporating additional pretraining on domain-specific corpora and task data. We propose an alternative approach for transferring pretrained language models to new domains by adapting their tokenizers. We show that domain-specific subword sequences can be determined efficiently directly from divergences in the conditional token distributions of the base and domain-specific corpora. In datasets from four disparate domains, we find adaptive tokenization on a pretrained RoBERTa model provides greater than 85{\%} of the performance benefits of domain specific pretraining. Our approach produces smaller models and less training and inference time than other approaches using tokenizer augmentation. Although using adaptive tokenization incurs a 6{\%} increase in model parameters (due to the introduction of 10k new domain-specific tokens), our approach, using 64 CPUs, is {\ensuremath{>}}72x faster than further pretraining the language model on domain-specific corpora on 8 TPUs.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sachidananda-etal-2021-efficient">
<titleInfo>
<title>Efficient Domain Adaptation of Language Models via Adaptive Tokenization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vin</namePart>
<namePart type="family">Sachidananda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jason</namePart>
<namePart type="family">Kessler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi-An</namePart>
<namePart type="family">Lai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Simple and Efficient Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nafise</namePart>
<namePart type="given">Sadat</namePart>
<namePart type="family">Moosavi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iryna</namePart>
<namePart type="family">Gurevych</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Angela</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Wolf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yufang</namePart>
<namePart type="family">Hou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ana</namePart>
<namePart type="family">Marasović</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sujith</namePart>
<namePart type="family">Ravi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Virtual</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Contextual embedding-based language models trained on large data sets, such as BERT and RoBERTa, provide strong performance across a wide range of tasks and are ubiquitous in modern NLP. It has been observed that fine-tuning these models on tasks involving data from domains different from that on which they were pretrained can lead to suboptimal performance. Recent work has explored approaches to adapt pretrained language models to new domains by incorporating additional pretraining on domain-specific corpora and task data. We propose an alternative approach for transferring pretrained language models to new domains by adapting their tokenizers. We show that domain-specific subword sequences can be determined efficiently directly from divergences in the conditional token distributions of the base and domain-specific corpora. In datasets from four disparate domains, we find adaptive tokenization on a pretrained RoBERTa model provides greater than 85% of the performance benefits of domain specific pretraining. Our approach produces smaller models and less training and inference time than other approaches using tokenizer augmentation. Although using adaptive tokenization incurs a 6% increase in model parameters (due to the introduction of 10k new domain-specific tokens), our approach, using 64 CPUs, is &gt;72x faster than further pretraining the language model on domain-specific corpora on 8 TPUs.</abstract>
<identifier type="citekey">sachidananda-etal-2021-efficient</identifier>
<identifier type="doi">10.18653/v1/2021.sustainlp-1.16</identifier>
<location>
<url>https://aclanthology.org/2021.sustainlp-1.16/</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>155</start>
<end>165</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Efficient Domain Adaptation of Language Models via Adaptive Tokenization
%A Sachidananda, Vin
%A Kessler, Jason
%A Lai, Yi-An
%Y Moosavi, Nafise Sadat
%Y Gurevych, Iryna
%Y Fan, Angela
%Y Wolf, Thomas
%Y Hou, Yufang
%Y Marasović, Ana
%Y Ravi, Sujith
%S Proceedings of the Second Workshop on Simple and Efficient Natural Language Processing
%D 2021
%8 November
%I Association for Computational Linguistics
%C Virtual
%F sachidananda-etal-2021-efficient
%X Contextual embedding-based language models trained on large data sets, such as BERT and RoBERTa, provide strong performance across a wide range of tasks and are ubiquitous in modern NLP. It has been observed that fine-tuning these models on tasks involving data from domains different from that on which they were pretrained can lead to suboptimal performance. Recent work has explored approaches to adapt pretrained language models to new domains by incorporating additional pretraining on domain-specific corpora and task data. We propose an alternative approach for transferring pretrained language models to new domains by adapting their tokenizers. We show that domain-specific subword sequences can be determined efficiently directly from divergences in the conditional token distributions of the base and domain-specific corpora. In datasets from four disparate domains, we find adaptive tokenization on a pretrained RoBERTa model provides greater than 85% of the performance benefits of domain specific pretraining. Our approach produces smaller models and less training and inference time than other approaches using tokenizer augmentation. Although using adaptive tokenization incurs a 6% increase in model parameters (due to the introduction of 10k new domain-specific tokens), our approach, using 64 CPUs, is >72x faster than further pretraining the language model on domain-specific corpora on 8 TPUs.
%R 10.18653/v1/2021.sustainlp-1.16
%U https://aclanthology.org/2021.sustainlp-1.16/
%U https://doi.org/10.18653/v1/2021.sustainlp-1.16
%P 155-165
Markdown (Informal)
[Efficient Domain Adaptation of Language Models via Adaptive Tokenization](https://aclanthology.org/2021.sustainlp-1.16/) (Sachidananda et al., sustainlp 2021)
ACL