@inproceedings{bao-etal-2020-chunk,
title = "Chunk-based {C}hinese Spelling Check with Global Optimization",
author = "Bao, Zuyi and
Li, Chen and
Wang, Rui",
editor = "Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.findings-emnlp.184",
doi = "10.18653/v1/2020.findings-emnlp.184",
pages = "2031--2040",
abstract = "Chinese spelling check is a challenging task due to the characteristics of the Chinese language, such as the large character set, no word boundary, and short word length. On the one hand, most of the previous works only consider corrections with similar character pronunciation or shape, failing to correct visually and phonologically irrelevant typos. On the other hand, pipeline-style architectures are widely adopted to deal with different types of spelling errors in individual modules, which is difficult to optimize. In order to handle these issues, in this work, 1) we extend the traditional confusion sets with semantical candidates to cover different types of errors; 2) we propose a chunk-based framework to correct single-character and multi-character word errors uniformly; and 3) we adopt a global optimization strategy to enable a sentence-level correction selection. The experimental results show that the proposed approach achieves a new state-of-the-art performance on three benchmark datasets, as well as an optical character recognition dataset.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bao-etal-2020-chunk">
<titleInfo>
<title>Chunk-based Chinese Spelling Check with Global Optimization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zuyi</namePart>
<namePart type="family">Bao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chen</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2020</title>
</titleInfo>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Chinese spelling check is a challenging task due to the characteristics of the Chinese language, such as the large character set, no word boundary, and short word length. On the one hand, most of the previous works only consider corrections with similar character pronunciation or shape, failing to correct visually and phonologically irrelevant typos. On the other hand, pipeline-style architectures are widely adopted to deal with different types of spelling errors in individual modules, which is difficult to optimize. In order to handle these issues, in this work, 1) we extend the traditional confusion sets with semantical candidates to cover different types of errors; 2) we propose a chunk-based framework to correct single-character and multi-character word errors uniformly; and 3) we adopt a global optimization strategy to enable a sentence-level correction selection. The experimental results show that the proposed approach achieves a new state-of-the-art performance on three benchmark datasets, as well as an optical character recognition dataset.</abstract>
<identifier type="citekey">bao-etal-2020-chunk</identifier>
<identifier type="doi">10.18653/v1/2020.findings-emnlp.184</identifier>
<location>
<url>https://aclanthology.org/2020.findings-emnlp.184</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>2031</start>
<end>2040</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Chunk-based Chinese Spelling Check with Global Optimization
%A Bao, Zuyi
%A Li, Chen
%A Wang, Rui
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Findings of the Association for Computational Linguistics: EMNLP 2020
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F bao-etal-2020-chunk
%X Chinese spelling check is a challenging task due to the characteristics of the Chinese language, such as the large character set, no word boundary, and short word length. On the one hand, most of the previous works only consider corrections with similar character pronunciation or shape, failing to correct visually and phonologically irrelevant typos. On the other hand, pipeline-style architectures are widely adopted to deal with different types of spelling errors in individual modules, which is difficult to optimize. In order to handle these issues, in this work, 1) we extend the traditional confusion sets with semantical candidates to cover different types of errors; 2) we propose a chunk-based framework to correct single-character and multi-character word errors uniformly; and 3) we adopt a global optimization strategy to enable a sentence-level correction selection. The experimental results show that the proposed approach achieves a new state-of-the-art performance on three benchmark datasets, as well as an optical character recognition dataset.
%R 10.18653/v1/2020.findings-emnlp.184
%U https://aclanthology.org/2020.findings-emnlp.184
%U https://doi.org/10.18653/v1/2020.findings-emnlp.184
%P 2031-2040
Markdown (Informal)
[Chunk-based Chinese Spelling Check with Global Optimization](https://aclanthology.org/2020.findings-emnlp.184) (Bao et al., Findings 2020)
ACL