@inproceedings{lam-etal-2024-multi,
title = "Multi-Tiered {C}antonese Word Segmentation",
author = "Lam, Charles and
Lau, Chaak-ming and
Lee, Jackson L.",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1047/",
pages = "11993--12002",
abstract = "Word segmentation for Chinese text data is essential for compiling corpora and any other tasks where the notion of {\textquotedblleft}word{\textquotedblright} is assumed, since Chinese orthography does not have conventional word boundaries as languages such as English do. A perennial issue, however, is that there is no consensus about the definition of {\textquotedblleft}word{\textquotedblright} in Chinese, which makes word segmentation challenging. Recent work in Chinese word segmentation has begun to embrace the idea of multiple word segmentation possibilities. In a similar spirit, this paper focuses on Cantonese, another major Chinese variety. We propose a linguistically motivated, multi-tiered word segmentation system for Cantonese, and release a Cantonese corpus of 150,000 characters word-segmented by this proposal. Our work will be of interest to researchers whose work involves Cantonese corpus data."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lam-etal-2024-multi">
<titleInfo>
<title>Multi-Tiered Cantonese Word Segmentation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Charles</namePart>
<namePart type="family">Lam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chaak-ming</namePart>
<namePart type="family">Lau</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jackson</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Word segmentation for Chinese text data is essential for compiling corpora and any other tasks where the notion of “word” is assumed, since Chinese orthography does not have conventional word boundaries as languages such as English do. A perennial issue, however, is that there is no consensus about the definition of “word” in Chinese, which makes word segmentation challenging. Recent work in Chinese word segmentation has begun to embrace the idea of multiple word segmentation possibilities. In a similar spirit, this paper focuses on Cantonese, another major Chinese variety. We propose a linguistically motivated, multi-tiered word segmentation system for Cantonese, and release a Cantonese corpus of 150,000 characters word-segmented by this proposal. Our work will be of interest to researchers whose work involves Cantonese corpus data.</abstract>
<identifier type="citekey">lam-etal-2024-multi</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1047/</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>11993</start>
<end>12002</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multi-Tiered Cantonese Word Segmentation
%A Lam, Charles
%A Lau, Chaak-ming
%A Lee, Jackson L.
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F lam-etal-2024-multi
%X Word segmentation for Chinese text data is essential for compiling corpora and any other tasks where the notion of “word” is assumed, since Chinese orthography does not have conventional word boundaries as languages such as English do. A perennial issue, however, is that there is no consensus about the definition of “word” in Chinese, which makes word segmentation challenging. Recent work in Chinese word segmentation has begun to embrace the idea of multiple word segmentation possibilities. In a similar spirit, this paper focuses on Cantonese, another major Chinese variety. We propose a linguistically motivated, multi-tiered word segmentation system for Cantonese, and release a Cantonese corpus of 150,000 characters word-segmented by this proposal. Our work will be of interest to researchers whose work involves Cantonese corpus data.
%U https://aclanthology.org/2024.lrec-main.1047/
%P 11993-12002
Markdown (Informal)
[Multi-Tiered Cantonese Word Segmentation](https://aclanthology.org/2024.lrec-main.1047/) (Lam et al., LREC-COLING 2024)
ACL
- Charles Lam, Chaak-ming Lau, and Jackson L. Lee. 2024. Multi-Tiered Cantonese Word Segmentation. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 11993–12002, Torino, Italia. ELRA and ICCL.