@inproceedings{li-etal-2022-wspeller,
title = "{WS}peller: Robust Word Segmentation for Enhancing {C}hinese Spelling Check",
author = "Li, Fangfang and
Shan, Youran and
Duan, Junwen and
Mao, Xingliang and
Huang, Minlie",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-emnlp.84/",
doi = "10.18653/v1/2022.findings-emnlp.84",
pages = "1179--1188",
abstract = "Chinese spelling check (CSC) detects and corrects spelling errors in Chinese texts. Previous approaches have combined character-level phonetic and graphic information, ignoring the importance of segment-level information. According to our pilot study, spelling errors are always associated with incorrect word segmentation. When appropriate word boundaries are provided, CSC performance is greatly enhanced. Based on these findings, we present WSpeller, a CSC model that takes into account word segmentation. A fundamental component of WSpeller is a W-MLM, which is trained by predicting visually and phonetically similar words. Through modification of the embedding layer`s input, word segmentation information can be incorporated. Additionally, a robust module is trained to assist the W-MLM-based correction module by predicting the correct word segmentations from sentences containing spelling errors. We evaluate WSpeller on the widely used benchmark datasets SIGHAN13, SIGHAN14, and SIGHAN15. Our model is superior to state-of-the-art baselines on SIGHAN13 and SIGHAN15 and maintains equal performance on SIGHAN14."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2022-wspeller">
<titleInfo>
<title>WSpeller: Robust Word Segmentation for Enhancing Chinese Spelling Check</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fangfang</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Youran</namePart>
<namePart type="family">Shan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junwen</namePart>
<namePart type="family">Duan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xingliang</namePart>
<namePart type="family">Mao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minlie</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Chinese spelling check (CSC) detects and corrects spelling errors in Chinese texts. Previous approaches have combined character-level phonetic and graphic information, ignoring the importance of segment-level information. According to our pilot study, spelling errors are always associated with incorrect word segmentation. When appropriate word boundaries are provided, CSC performance is greatly enhanced. Based on these findings, we present WSpeller, a CSC model that takes into account word segmentation. A fundamental component of WSpeller is a W-MLM, which is trained by predicting visually and phonetically similar words. Through modification of the embedding layer‘s input, word segmentation information can be incorporated. Additionally, a robust module is trained to assist the W-MLM-based correction module by predicting the correct word segmentations from sentences containing spelling errors. We evaluate WSpeller on the widely used benchmark datasets SIGHAN13, SIGHAN14, and SIGHAN15. Our model is superior to state-of-the-art baselines on SIGHAN13 and SIGHAN15 and maintains equal performance on SIGHAN14.</abstract>
<identifier type="citekey">li-etal-2022-wspeller</identifier>
<identifier type="doi">10.18653/v1/2022.findings-emnlp.84</identifier>
<location>
<url>https://aclanthology.org/2022.findings-emnlp.84/</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>1179</start>
<end>1188</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T WSpeller: Robust Word Segmentation for Enhancing Chinese Spelling Check
%A Li, Fangfang
%A Shan, Youran
%A Duan, Junwen
%A Mao, Xingliang
%A Huang, Minlie
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F li-etal-2022-wspeller
%X Chinese spelling check (CSC) detects and corrects spelling errors in Chinese texts. Previous approaches have combined character-level phonetic and graphic information, ignoring the importance of segment-level information. According to our pilot study, spelling errors are always associated with incorrect word segmentation. When appropriate word boundaries are provided, CSC performance is greatly enhanced. Based on these findings, we present WSpeller, a CSC model that takes into account word segmentation. A fundamental component of WSpeller is a W-MLM, which is trained by predicting visually and phonetically similar words. Through modification of the embedding layer‘s input, word segmentation information can be incorporated. Additionally, a robust module is trained to assist the W-MLM-based correction module by predicting the correct word segmentations from sentences containing spelling errors. We evaluate WSpeller on the widely used benchmark datasets SIGHAN13, SIGHAN14, and SIGHAN15. Our model is superior to state-of-the-art baselines on SIGHAN13 and SIGHAN15 and maintains equal performance on SIGHAN14.
%R 10.18653/v1/2022.findings-emnlp.84
%U https://aclanthology.org/2022.findings-emnlp.84/
%U https://doi.org/10.18653/v1/2022.findings-emnlp.84
%P 1179-1188
Markdown (Informal)
[WSpeller: Robust Word Segmentation for Enhancing Chinese Spelling Check](https://aclanthology.org/2022.findings-emnlp.84/) (Li et al., Findings 2022)
ACL