@inproceedings{hou-etal-2023-effects,
title = "Effects of sub-word segmentation on performance of transformer language models",
author = "Hou, Jue and
Katinskaia, Anisia and
Vu, Anh-Duc and
Yangarber, Roman",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.459/",
doi = "10.18653/v1/2023.emnlp-main.459",
pages = "7413--7425",
abstract = "Language modeling is a fundamental task in natural language processing, which has been thoroughly explored with various architectures and hyperparameters. However, few studies focus on the effect of sub-word segmentation on the performance of language models (LMs). In this paper, we compare GPT and BERT models trained with the statistical segmentation algorithm BPE vs. two unsupervised algorithms for morphological segmentation {---} Morfessor and StateMorph. We train the models for several languages {---} including ones with very rich morphology {---} and compare their performance with different segmentation algorithms, vocabulary sizes, and model sizes. The results show that training with morphological segmentation allows the LMs to: (1) achieve lower perplexity, (2) converge more efficiently in terms of training time, and (3) achieve equivalent or better evaluation scores on downstream tasks. Lastly, we show that (4) LMs of smaller size using morphological segmentation can perform comparably to models of larger size trained with BPE {---} both in terms of (1) perplexity and (3) scores on downstream tasks. Points (2) and (4) impact on sustainability, since they reduce the model cost; and while 2 reduces cost only in the training phase, 4 does so also in the inference phase."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hou-etal-2023-effects">
<titleInfo>
<title>Effects of sub-word segmentation on performance of transformer language models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jue</namePart>
<namePart type="family">Hou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anisia</namePart>
<namePart type="family">Katinskaia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anh-Duc</namePart>
<namePart type="family">Vu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roman</namePart>
<namePart type="family">Yangarber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Language modeling is a fundamental task in natural language processing, which has been thoroughly explored with various architectures and hyperparameters. However, few studies focus on the effect of sub-word segmentation on the performance of language models (LMs). In this paper, we compare GPT and BERT models trained with the statistical segmentation algorithm BPE vs. two unsupervised algorithms for morphological segmentation — Morfessor and StateMorph. We train the models for several languages — including ones with very rich morphology — and compare their performance with different segmentation algorithms, vocabulary sizes, and model sizes. The results show that training with morphological segmentation allows the LMs to: (1) achieve lower perplexity, (2) converge more efficiently in terms of training time, and (3) achieve equivalent or better evaluation scores on downstream tasks. Lastly, we show that (4) LMs of smaller size using morphological segmentation can perform comparably to models of larger size trained with BPE — both in terms of (1) perplexity and (3) scores on downstream tasks. Points (2) and (4) impact on sustainability, since they reduce the model cost; and while 2 reduces cost only in the training phase, 4 does so also in the inference phase.</abstract>
<identifier type="citekey">hou-etal-2023-effects</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.459</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.459/</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>7413</start>
<end>7425</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Effects of sub-word segmentation on performance of transformer language models
%A Hou, Jue
%A Katinskaia, Anisia
%A Vu, Anh-Duc
%A Yangarber, Roman
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F hou-etal-2023-effects
%X Language modeling is a fundamental task in natural language processing, which has been thoroughly explored with various architectures and hyperparameters. However, few studies focus on the effect of sub-word segmentation on the performance of language models (LMs). In this paper, we compare GPT and BERT models trained with the statistical segmentation algorithm BPE vs. two unsupervised algorithms for morphological segmentation — Morfessor and StateMorph. We train the models for several languages — including ones with very rich morphology — and compare their performance with different segmentation algorithms, vocabulary sizes, and model sizes. The results show that training with morphological segmentation allows the LMs to: (1) achieve lower perplexity, (2) converge more efficiently in terms of training time, and (3) achieve equivalent or better evaluation scores on downstream tasks. Lastly, we show that (4) LMs of smaller size using morphological segmentation can perform comparably to models of larger size trained with BPE — both in terms of (1) perplexity and (3) scores on downstream tasks. Points (2) and (4) impact on sustainability, since they reduce the model cost; and while 2 reduces cost only in the training phase, 4 does so also in the inference phase.
%R 10.18653/v1/2023.emnlp-main.459
%U https://aclanthology.org/2023.emnlp-main.459/
%U https://doi.org/10.18653/v1/2023.emnlp-main.459
%P 7413-7425
Markdown (Informal)
[Effects of sub-word segmentation on performance of transformer language models](https://aclanthology.org/2023.emnlp-main.459/) (Hou et al., EMNLP 2023)
ACL
Jue Hou, Anisia Katinskaia, Anh-Duc Vu, and Roman Yangarber. 2023. Effects of sub-word segmentation on performance of transformer language models. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pages 7413–7425, Singapore. Association for Computational Linguistics.