@inproceedings{cognetta-etal-2023-parameter,
title = "Parameter-Efficient {K}orean Character-Level Language Modeling",
author = "Cognetta, Marco and
Moon, Sangwhan and
Wolf-Sonkin, Lawrence and
Okazaki, Naoaki",
editor = "Vlachos, Andreas and
Augenstein, Isabelle",
booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.eacl-main.172",
doi = "10.18653/v1/2023.eacl-main.172",
pages = "2350--2356",
abstract = "Character-level language modeling has been shown empirically to perform well on highly agglutinative or morphologically rich languages while using only a small fraction of the parameters required by (sub)word models. Korean fits nicely into this framework, except that, like other CJK languages, it has a very large character vocabulary of 11,172 unique syllables. However, unlike Japanese Kanji and Chinese Hanzi, each Korean syllable can be uniquely factored into a small set of subcharacters, called jamo. We explore a {``}three-hot{''} scheme, where we exploit the decomposability of Korean characters to model at the syllable level but using only jamo-level representations. We find that our three-hot embedding and decoding scheme alleviates the two major issues with prior syllable- and jamo-level models. Namely, it requires fewer than 1{\%} of the embedding parameters of a syllable model, and it does not require tripling the sequence length, as with jamo models. In addition, it addresses a theoretical flaw in a prior three-hot modeling scheme. Our experiments show that, even when reducing the number of embedding parameters by 99.6{\%} (from 11.4M to just 36k), our model suffers no loss in translation quality compared to the baseline syllable model.",
}
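The abstract's key structural fact is that each of the 11,172 precomposed Hangul syllables (19 initial consonants x 21 vowels x 28 finals, counting "no final") factors uniquely into three jamo. A minimal sketch of that factorization using standard Unicode Hangul arithmetic, not code from the paper:

```python
# Every precomposed Hangul syllable occupies U+AC00..U+D7A3 and factors
# uniquely into (initial, medial, final) jamo indices. This is standard
# Unicode arithmetic; the paper builds its "three-hot" scheme on top of it.

HANGUL_BASE = 0xAC00                              # first precomposed syllable
NUM_INITIAL, NUM_MEDIAL, NUM_FINAL = 19, 21, 28   # final index 0 = "no final"

def decompose(syllable: str) -> tuple[int, int, int]:
    """Map one Hangul syllable to its (initial, medial, final) jamo indices."""
    index = ord(syllable) - HANGUL_BASE
    if not 0 <= index < NUM_INITIAL * NUM_MEDIAL * NUM_FINAL:
        raise ValueError(f"not a precomposed Hangul syllable: {syllable!r}")
    initial, rest = divmod(index, NUM_MEDIAL * NUM_FINAL)
    medial, final = divmod(rest, NUM_FINAL)
    return initial, medial, final

def compose(initial: int, medial: int, final: int) -> str:
    """Inverse map: three jamo indices back to one syllable."""
    return chr(HANGUL_BASE + (initial * NUM_MEDIAL + medial) * NUM_FINAL + final)

# "han" (U+D55C) is initial 18, medial 0 (the vowel "a"), final 4 ("n").
assert decompose("\ud55c") == (18, 0, 4) and compose(18, 0, 4) == "\ud55c"
```

The round trip compose(decompose(s)) == s holds for every syllable in the block, which is what allows a model to represent and predict a syllable via three small jamo distributions rather than one 11,172-way vocabulary.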
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="cognetta-etal-2023-parameter">
    <titleInfo>
      <title>Parameter-Efficient Korean Character-Level Language Modeling</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Marco</namePart>
      <namePart type="family">Cognetta</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sangwhan</namePart>
      <namePart type="family">Moon</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lawrence</namePart>
      <namePart type="family">Wolf-Sonkin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Naoaki</namePart>
      <namePart type="family">Okazaki</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Andreas</namePart>
        <namePart type="family">Vlachos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Isabelle</namePart>
        <namePart type="family">Augenstein</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dubrovnik, Croatia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Character-level language modeling has been shown empirically to perform well on highly agglutinative or morphologically rich languages while using only a small fraction of the parameters required by (sub)word models. Korean fits nicely into this framework, except that, like other CJK languages, it has a very large character vocabulary of 11,172 unique syllables. However, unlike Japanese Kanji and Chinese Hanzi, each Korean syllable can be uniquely factored into a small set of subcharacters, called jamo. We explore a “three-hot” scheme, where we exploit the decomposability of Korean characters to model at the syllable level but using only jamo-level representations. We find that our three-hot embedding and decoding scheme alleviates the two major issues with prior syllable- and jamo-level models. Namely, it requires fewer than 1% of the embedding parameters of a syllable model, and it does not require tripling the sequence length, as with jamo models. In addition, it addresses a theoretical flaw in a prior three-hot modeling scheme. Our experiments show that, even when reducing the number of embedding parameters by 99.6% (from 11.4M to just 36k), our model suffers no loss in translation quality compared to the baseline syllable model.</abstract>
    <identifier type="citekey">cognetta-etal-2023-parameter</identifier>
    <identifier type="doi">10.18653/v1/2023.eacl-main.172</identifier>
    <location>
      <url>https://aclanthology.org/2023.eacl-main.172</url>
    </location>
    <part>
      <date>2023-05</date>
      <extent unit="page">
        <start>2350</start>
        <end>2356</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Parameter-Efficient Korean Character-Level Language Modeling
%A Cognetta, Marco
%A Moon, Sangwhan
%A Wolf-Sonkin, Lawrence
%A Okazaki, Naoaki
%Y Vlachos, Andreas
%Y Augenstein, Isabelle
%S Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F cognetta-etal-2023-parameter
%X Character-level language modeling has been shown empirically to perform well on highly agglutinative or morphologically rich languages while using only a small fraction of the parameters required by (sub)word models. Korean fits nicely into this framework, except that, like other CJK languages, it has a very large character vocabulary of 11,172 unique syllables. However, unlike Japanese Kanji and Chinese Hanzi, each Korean syllable can be uniquely factored into a small set of subcharacters, called jamo. We explore a “three-hot” scheme, where we exploit the decomposability of Korean characters to model at the syllable level but using only jamo-level representations. We find that our three-hot embedding and decoding scheme alleviates the two major issues with prior syllable- and jamo-level models. Namely, it requires fewer than 1% of the embedding parameters of a syllable model, and it does not require tripling the sequence length, as with jamo models. In addition, it addresses a theoretical flaw in a prior three-hot modeling scheme. Our experiments show that, even when reducing the number of embedding parameters by 99.6% (from 11.4M to just 36k), our model suffers no loss in translation quality compared to the baseline syllable model.
%R 10.18653/v1/2023.eacl-main.172
%U https://aclanthology.org/2023.eacl-main.172
%U https://doi.org/10.18653/v1/2023.eacl-main.172
%P 2350-2356
Markdown (Informal)
[Parameter-Efficient Korean Character-Level Language Modeling](https://aclanthology.org/2023.eacl-main.172) (Cognetta et al., EACL 2023)

ACL
Marco Cognetta, Sangwhan Moon, Lawrence Wolf-Sonkin, and Naoaki Okazaki. 2023. Parameter-Efficient Korean Character-Level Language Modeling. In Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics, pages 2350–2356, Dubrovnik, Croatia. Association for Computational Linguistics.
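For a sense of where the parameter savings described in the abstract come from, here is a hedged sketch of a three-hot embedding layer: one small table per jamo slot whose lookups are summed into a single syllable representation. The summation and the dimension are illustrative assumptions, not the paper's exact architecture, which may combine the three embeddings differently:

```python
# A sketch (in PyTorch) of a three-hot embedding: three small jamo tables
# replace one 11,172-row syllable table. Summation and dim=512 are
# assumptions for illustration, not the paper's reported configuration.
import torch
import torch.nn as nn

class ThreeHotEmbedding(nn.Module):
    def __init__(self, dim: int = 512):
        super().__init__()
        self.initial = nn.Embedding(19, dim)  # choseong (initial consonants)
        self.medial = nn.Embedding(21, dim)   # jungseong (vowels)
        self.final = nn.Embedding(28, dim)    # jongseong, index 0 = no final

    def forward(self, ini: torch.Tensor, med: torch.Tensor,
                fin: torch.Tensor) -> torch.Tensor:
        # Each argument holds jamo indices for a batch of syllables;
        # the three lookups are summed into one syllable embedding.
        return self.initial(ini) + self.medial(med) + self.final(fin)

emb = ThreeHotEmbedding(dim=512)
print(sum(p.numel() for p in emb.parameters()))
# (19 + 21 + 28) * 512 = 34,816 parameters, versus
# 11,172 * 512 = 5,720,064 for a full syllable table at the same dimension.
```

At equal dimension, the three jamo tables cost (19 + 21 + 28) * d parameters against 11,172 * d for a one-hot syllable table, roughly 0.6%, consistent with the abstract's claim of fewer than 1% of the embedding parameters.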