% Workshop paper record; its Anthology identifier is 2024.wat-1.2 (see url/doi below).
@inproceedings{imamura-utiyama-2024-empirical,
  title     = {An Empirical Study of Multilingual Vocabulary for Neural Machine Translation Models},
  author    = {Imamura, Kenji and Utiyama, Masao},
  editor    = {Nakazawa, Toshiaki and Goto, Isao},
  booktitle = {Proceedings of the Eleventh Workshop on Asian Translation (WAT 2024)},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.wat-1.2/},
  doi       = {10.18653/v1/2024.wat-1.2},
  pages     = {22--35},
  abstract  = {In this paper, we discuss multilingual vocabulary for neural machine translation models. Multilingual vocabularies should generate highly accurate machine translations regardless of the languages, and have preferences so that tokenized strings contain rare out-of-vocabulary (OOV) tokens and token sequences are short. In this paper, we discuss the characteristics of various multilingual vocabularies via tokenization and translation experiments. We also present our recommended vocabulary and tokenizer.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the citekey imamura-utiyama-2024-empirical. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="imamura-utiyama-2024-empirical">
    <titleInfo>
      <title>An Empirical Study of Multilingual Vocabulary for Neural Machine Translation Models</title>
    </titleInfo>
    <!-- Authors, in citation order. -->
    <name type="personal">
      <namePart type="given">Kenji</namePart>
      <namePart type="family">Imamura</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Masao</namePart>
      <namePart type="family">Utiyama</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Eleventh Workshop on Asian Translation (WAT 2024)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Toshiaki</namePart>
        <namePart type="family">Nakazawa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Isao</namePart>
        <namePart type="family">Goto</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In this paper, we discuss multilingual vocabulary for neural machine translation models. Multilingual vocabularies should generate highly accurate machine translations regardless of the languages, and have preferences so that tokenized strings contain rare out-of-vocabulary (OOV) tokens and token sequences are short. In this paper, we discuss the characteristics of various multilingual vocabularies via tokenization and translation experiments. We also present our recommended vocabulary and tokenizer.</abstract>
    <!-- Identifiers and landing page. -->
    <identifier type="citekey">imamura-utiyama-2024-empirical</identifier>
    <identifier type="doi">10.18653/v1/2024.wat-1.2</identifier>
    <location>
      <url>https://aclanthology.org/2024.wat-1.2/</url>
    </location>
    <!-- Date and page extent of this part. -->
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>22</start>
        <end>35</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T An Empirical Study of Multilingual Vocabulary for Neural Machine Translation Models
%A Imamura, Kenji
%A Utiyama, Masao
%Y Nakazawa, Toshiaki
%Y Goto, Isao
%S Proceedings of the Eleventh Workshop on Asian Translation (WAT 2024)
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F imamura-utiyama-2024-empirical
%X In this paper, we discuss multilingual vocabulary for neural machine translation models. Multilingual vocabularies should generate highly accurate machine translations regardless of the languages, and have preferences so that tokenized strings contain rare out-of-vocabulary (OOV) tokens and token sequences are short. In this paper, we discuss the characteristics of various multilingual vocabularies via tokenization and translation experiments. We also present our recommended vocabulary and tokenizer.
%R 10.18653/v1/2024.wat-1.2
%U https://aclanthology.org/2024.wat-1.2/
%U https://doi.org/10.18653/v1/2024.wat-1.2
%P 22-35
Markdown (Informal)
[An Empirical Study of Multilingual Vocabulary for Neural Machine Translation Models](https://aclanthology.org/2024.wat-1.2/) (Imamura & Utiyama, WAT 2024)
ACL