@inproceedings{virk-etal-2021-novel,
title = "A Novel Machine Learning Based Approach for Post-{OCR} Error Detection",
author = "Virk, Shafqat Mumtaz and
Dann{\'e}lls, Dana and
Sheikh Muhammad, Azam",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)",
month = sep,
year = "2021",
address = "Held Online",
publisher = "INCOMA Ltd.",
url = "https://aclanthology.org/2021.ranlp-1.164",
pages = "1463--1470",
abstract = "Post processing is the most conventional approach for correcting errors that are caused by Optical Character Recognition(OCR) systems. Two steps are usually taken to correct OCR errors: detection and corrections. For the first task, supervised machine learning methods have shown state-of-the-art performances. Previously proposed approaches have focused most prominently on combining lexical, contextual and statistical features for detecting errors. In this study, we report a novel system to error detection which is based merely on the n-gram counts of a candidate token. In addition to being simple and computationally less expensive, our proposed system beats previous systems reported in the ICDAR2019 competition on OCR-error detection with notable margins. We achieved state-of-the-art F1-scores for eight out of the ten involved European languages. The maximum improvement is for Spanish which improved from 0.69 to 0.90, and the minimum for Polish from 0.82 to 0.84.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="virk-etal-2021-novel">
<titleInfo>
<title>A Novel Machine Learning Based Approach for Post-OCR Error Detection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shafqat</namePart>
<namePart type="given">Mumtaz</namePart>
<namePart type="family">Virk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dana</namePart>
<namePart type="family">Dannélls</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Azam</namePart>
<namePart type="family">Sheikh Muhammad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd.</publisher>
<place>
<placeTerm type="text">Held Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Post processing is the most conventional approach for correcting errors that are caused by Optical Character Recognition(OCR) systems. Two steps are usually taken to correct OCR errors: detection and corrections. For the first task, supervised machine learning methods have shown state-of-the-art performances. Previously proposed approaches have focused most prominently on combining lexical, contextual and statistical features for detecting errors. In this study, we report a novel system to error detection which is based merely on the n-gram counts of a candidate token. In addition to being simple and computationally less expensive, our proposed system beats previous systems reported in the ICDAR2019 competition on OCR-error detection with notable margins. We achieved state-of-the-art F1-scores for eight out of the ten involved European languages. The maximum improvement is for Spanish which improved from 0.69 to 0.90, and the minimum for Polish from 0.82 to 0.84.</abstract>
<identifier type="citekey">virk-etal-2021-novel</identifier>
<location>
<url>https://aclanthology.org/2021.ranlp-1.164</url>
</location>
<part>
<date>2021-09</date>
<extent unit="page">
<start>1463</start>
<end>1470</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Novel Machine Learning Based Approach for Post-OCR Error Detection
%A Virk, Shafqat Mumtaz
%A Dannélls, Dana
%A Sheikh Muhammad, Azam
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)
%D 2021
%8 September
%I INCOMA Ltd.
%C Held Online
%F virk-etal-2021-novel
%X Post processing is the most conventional approach for correcting errors that are caused by Optical Character Recognition(OCR) systems. Two steps are usually taken to correct OCR errors: detection and corrections. For the first task, supervised machine learning methods have shown state-of-the-art performances. Previously proposed approaches have focused most prominently on combining lexical, contextual and statistical features for detecting errors. In this study, we report a novel system to error detection which is based merely on the n-gram counts of a candidate token. In addition to being simple and computationally less expensive, our proposed system beats previous systems reported in the ICDAR2019 competition on OCR-error detection with notable margins. We achieved state-of-the-art F1-scores for eight out of the ten involved European languages. The maximum improvement is for Spanish which improved from 0.69 to 0.90, and the minimum for Polish from 0.82 to 0.84.
%U https://aclanthology.org/2021.ranlp-1.164
%P 1463-1470
Markdown (Informal)
[A Novel Machine Learning Based Approach for Post-OCR Error Detection](https://aclanthology.org/2021.ranlp-1.164) (Virk et al., RANLP 2021)
ACL