@inproceedings{beinborn-pinter-2023-analyzing,
title = "Analyzing Cognitive Plausibility of Subword Tokenization",
author = "Beinborn, Lisa and
Pinter, Yuval",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.272/",
doi = "10.18653/v1/2023.emnlp-main.272",
pages = "4478--4486",
abstract = "Subword tokenization has become the de-facto standard for tokenization although comparative evaluations of their quality across languages are scarce. Existing evaluation studies focus on the effect of a tokenization algorithm on the performance in downstream tasks, or on engineering criteria such as the compression rate. We present a new evaluation paradigm that focuses on the cognitive plausibility of subword tokenization. We analyze the correlation of the tokenizer output with the reading time and accuracy of human responses on a lexical decision task. We compare three tokenization algorithms across several languages and vocabulary sizes. Our results indicate that the Unigram algorithm yields less cognitively plausible tokenization behavior and a worse coverage of derivational morphemes, in contrast with prior work."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="beinborn-pinter-2023-analyzing">
    <titleInfo>
        <title>Analyzing Cognitive Plausibility of Subword Tokenization</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Lisa</namePart>
        <namePart type="family">Beinborn</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Yuval</namePart>
        <namePart type="family">Pinter</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2023-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Houda</namePart>
            <namePart type="family">Bouamor</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Juan</namePart>
            <namePart type="family">Pino</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Kalika</namePart>
            <namePart type="family">Bali</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Singapore</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Subword tokenization has become the de-facto standard for tokenization, although comparative evaluations of its quality across languages are scarce. Existing evaluation studies focus on the effect of a tokenization algorithm on the performance in downstream tasks, or on engineering criteria such as the compression rate. We present a new evaluation paradigm that focuses on the cognitive plausibility of subword tokenization. We analyze the correlation of the tokenizer output with the reading time and accuracy of human responses on a lexical decision task. We compare three tokenization algorithms across several languages and vocabulary sizes. Our results indicate that the Unigram algorithm yields less cognitively plausible tokenization behavior and worse coverage of derivational morphemes, in contrast with prior work.</abstract>
<identifier type="citekey">beinborn-pinter-2023-analyzing</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.272</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.272/</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>4478</start>
<end>4486</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Analyzing Cognitive Plausibility of Subword Tokenization
%A Beinborn, Lisa
%A Pinter, Yuval
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F beinborn-pinter-2023-analyzing
%X Subword tokenization has become the de-facto standard for tokenization, although comparative evaluations of its quality across languages are scarce. Existing evaluation studies focus on the effect of a tokenization algorithm on the performance in downstream tasks, or on engineering criteria such as the compression rate. We present a new evaluation paradigm that focuses on the cognitive plausibility of subword tokenization. We analyze the correlation of the tokenizer output with the reading time and accuracy of human responses on a lexical decision task. We compare three tokenization algorithms across several languages and vocabulary sizes. Our results indicate that the Unigram algorithm yields less cognitively plausible tokenization behavior and worse coverage of derivational morphemes, in contrast with prior work.
%R 10.18653/v1/2023.emnlp-main.272
%U https://aclanthology.org/2023.emnlp-main.272/
%U https://doi.org/10.18653/v1/2023.emnlp-main.272
%P 4478-4486
Markdown (Informal)
[Analyzing Cognitive Plausibility of Subword Tokenization](https://aclanthology.org/2023.emnlp-main.272/) (Beinborn & Pinter, EMNLP 2023)
ACL
Lisa Beinborn and Yuval Pinter. 2023. [Analyzing Cognitive Plausibility of Subword Tokenization](https://aclanthology.org/2023.emnlp-main.272/). In *Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing*, pages 4478–4486, Singapore. Association for Computational Linguistics.
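
The abstract describes correlating tokenizer output with human reading times from a lexical decision task. The snippet below is a minimal sketch of that idea, not the paper's actual pipeline: it assumes a hypothetical `lexical_decision.csv` with `word` and `rt_ms` columns, and uses a toy greedy longest-match segmenter as a stand-in for a trained BPE/WordPiece/Unigram tokenizer, then computes the Spearman correlation between per-word subword counts and reading times.

```python
# Sketch of the cognitive-plausibility correlation analysis described in the
# abstract. Assumptions (not from the paper): a CSV named lexical_decision.csv
# with columns "word" and "rt_ms", and a toy greedy segmenter in place of a
# trained subword tokenizer.

import csv
from scipy.stats import spearmanr

# Toy subword vocabulary standing in for a learned tokenizer vocabulary.
VOCAB = {"un", "read", "able", "token", "ize", "ing", "s"}


def segment(word: str, vocab: set[str]) -> list[str]:
    """Greedy longest-match segmentation; falls back to single characters."""
    pieces, i = [], 0
    while i < len(word):
        for j in range(len(word), i, -1):
            if word[i:j] in vocab or j == i + 1:
                pieces.append(word[i:j])
                i = j
                break
    return pieces


def main() -> None:
    words, reading_times = [], []
    with open("lexical_decision.csv", newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            words.append(row["word"])
            reading_times.append(float(row["rt_ms"]))

    # One number per word: how many subword pieces the tokenizer produces.
    token_counts = [len(segment(w, VOCAB)) for w in words]

    # A cognitively plausible tokenizer should split words that take humans
    # longer to recognize into more pieces (positive rank correlation).
    rho, p = spearmanr(token_counts, reading_times)
    print(f"Spearman rho = {rho:.3f} (p = {p:.3g})")


if __name__ == "__main__":
    main()
```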