@inproceedings{xiang-etal-2022-cantonese,
title = "When {C}antonese {NLP} Meets Pre-training: Progress and Challenges",
author = "Xiang, Rong and
Tan, Hanzhuo and
Li, Jing and
Wan, Mingyu and
Wong, Kam-Fai",
editor = "Alonso, Miguel A. and
Wei, Zhongyu",
booktitle = "Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing: Tutorial Abstracts",
month = nov,
year = "2022",
address = "Taipei",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.aacl-tutorials.3/",
doi = "10.18653/v1/2022.aacl-tutorials.3",
pages = "16--21",
abstract = "Cantonese is an influential Chinese variant with a large population of speakers worldwide. However, it is under-resourced in terms of the data scale and diversity, excluding Cantonese Natural Language Processing (NLP) from the state-of-the-art (SOTA) {\textquotedblleft}pre-training and fine-tuning{\textquotedblright} paradigm. This tutorial will start with a substantial review of the linguistics and NLP progress for shaping language specificity, resources, and methodologies. It will be followed by an introduction to the trendy transformer-based pre-training methods, which have been largely advancing the SOTA performance of a wide range of downstream NLP tasks in numerous majority languages (e.g., English and Chinese). Based on the above, we will present the main challenges for Cantonese NLP in relation to Cantonese language idiosyncrasies of colloquialism and multilingualism, followed by the future directions to line NLP for Cantonese and other low-resource languages up to the cutting-edge pre-training practice."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xiang-etal-2022-cantonese">
<titleInfo>
<title>When Cantonese NLP Meets Pre-training: Progress and Challenges</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rong</namePart>
<namePart type="family">Xiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanzhuo</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingyu</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kam-Fai</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing: Tutorial Abstracts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Miguel</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Alonso</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhongyu</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Taipei</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Cantonese is an influential Chinese variant with a large population of speakers worldwide. However, it is under-resourced in terms of the data scale and diversity, excluding Cantonese Natural Language Processing (NLP) from the state-of-the-art (SOTA) “pre-training and fine-tuning” paradigm. This tutorial will start with a substantial review of the linguistics and NLP progress for shaping language specificity, resources, and methodologies. It will be followed by an introduction to the trendy transformer-based pre-training methods, which have been largely advancing the SOTA performance of a wide range of downstream NLP tasks in numerous majority languages (e.g., English and Chinese). Based on the above, we will present the main challenges for Cantonese NLP in relation to Cantonese language idiosyncrasies of colloquialism and multilingualism, followed by the future directions to line NLP for Cantonese and other low-resource languages up to the cutting-edge pre-training practice.</abstract>
<identifier type="citekey">xiang-etal-2022-cantonese</identifier>
<identifier type="doi">10.18653/v1/2022.aacl-tutorials.3</identifier>
<location>
<url>https://aclanthology.org/2022.aacl-tutorials.3/</url>
</location>
<part>
<date>2022-11</date>
<extent unit="page">
<start>16</start>
<end>21</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T When Cantonese NLP Meets Pre-training: Progress and Challenges
%A Xiang, Rong
%A Tan, Hanzhuo
%A Li, Jing
%A Wan, Mingyu
%A Wong, Kam-Fai
%Y Alonso, Miguel A.
%Y Wei, Zhongyu
%S Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing: Tutorial Abstracts
%D 2022
%8 November
%I Association for Computational Linguistics
%C Taipei
%F xiang-etal-2022-cantonese
%X Cantonese is an influential Chinese variant with a large population of speakers worldwide. However, it is under-resourced in terms of the data scale and diversity, excluding Cantonese Natural Language Processing (NLP) from the state-of-the-art (SOTA) “pre-training and fine-tuning” paradigm. This tutorial will start with a substantial review of the linguistics and NLP progress for shaping language specificity, resources, and methodologies. It will be followed by an introduction to the trendy transformer-based pre-training methods, which have been largely advancing the SOTA performance of a wide range of downstream NLP tasks in numerous majority languages (e.g., English and Chinese). Based on the above, we will present the main challenges for Cantonese NLP in relation to Cantonese language idiosyncrasies of colloquialism and multilingualism, followed by the future directions to line NLP for Cantonese and other low-resource languages up to the cutting-edge pre-training practice.
%R 10.18653/v1/2022.aacl-tutorials.3
%U https://aclanthology.org/2022.aacl-tutorials.3/
%U https://doi.org/10.18653/v1/2022.aacl-tutorials.3
%P 16-21
Markdown (Informal)
[When Cantonese NLP Meets Pre-training: Progress and Challenges](https://aclanthology.org/2022.aacl-tutorials.3/) (Xiang et al., AACL-IJCNLP 2022)
ACL
- Rong Xiang, Hanzhuo Tan, Jing Li, Mingyu Wan, and Kam-Fai Wong. 2022. When Cantonese NLP Meets Pre-training: Progress and Challenges. In Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing: Tutorial Abstracts, pages 16–21, Taipei. Association for Computational Linguistics.