@article{rijhwani-etal-2021-lexically,
title = "Lexically Aware Semi-Supervised Learning for {OCR} Post-Correction",
author = "Rijhwani, Shruti and
Rosenblum, Daisy and
Anastasopoulos, Antonios and
Neubig, Graham",
editor = "Roark, Brian and
Nenkova, Ani",
journal = "Transactions of the Association for Computational Linguistics",
volume = "9",
year = "2021",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2021.tacl-1.76/",
doi = "10.1162/tacl_a_00427",
pages = "1285--1302",
abstract = "Much of the existing linguistic data in many languages of the world is locked away in non- digitized books and documents. Optical character recognition (OCR) can be used to produce digitized text, and previous work has demonstrated the utility of neural post-correction methods that improve the results of general- purpose OCR systems on recognition of less- well-resourced languages. However, these methods rely on manually curated post- correction data, which are relatively scarce compared to the non-annotated raw images that need to be digitized. In this paper, we present a semi-supervised learning method that makes it possible to utilize these raw images to improve performance, specifically through the use of self-training, a technique where a model is iteratively trained on its own outputs. In addition, to enforce consistency in the recognized vocabulary, we introduce a lexically aware decoding method that augments the neural post-correction model with a count-based language model constructed from the recognized texts, implemented using weighted finite-state automata (WFSA) for efficient and effective decoding. Results on four endangered languages demonstrate the utility of the proposed method, with relative error reductions of 15{\%}{--}29{\%}, where we find the combination of self-training and lexically aware decoding essential for achieving consistent improvements.1"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rijhwani-etal-2021-lexically">
<titleInfo>
<title>Lexically Aware Semi-Supervised Learning for OCR Post-Correction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shruti</namePart>
<namePart type="family">Rijhwani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daisy</namePart>
<namePart type="family">Rosenblum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonios</namePart>
<namePart type="family">Anastasopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Graham</namePart>
<namePart type="family">Neubig</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Much of the existing linguistic data in many languages of the world is locked away in non-digitized books and documents. Optical character recognition (OCR) can be used to produce digitized text, and previous work has demonstrated the utility of neural post-correction methods that improve the results of general-purpose OCR systems on recognition of less-well-resourced languages. However, these methods rely on manually curated post-correction data, which are relatively scarce compared to the non-annotated raw images that need to be digitized. In this paper, we present a semi-supervised learning method that makes it possible to utilize these raw images to improve performance, specifically through the use of self-training, a technique where a model is iteratively trained on its own outputs. In addition, to enforce consistency in the recognized vocabulary, we introduce a lexically aware decoding method that augments the neural post-correction model with a count-based language model constructed from the recognized texts, implemented using weighted finite-state automata (WFSA) for efficient and effective decoding. Results on four endangered languages demonstrate the utility of the proposed method, with relative error reductions of 15%–29%, where we find the combination of self-training and lexically aware decoding essential for achieving consistent improvements.</abstract>
<identifier type="citekey">rijhwani-etal-2021-lexically</identifier>
<identifier type="doi">10.1162/tacl_a_00427</identifier>
<location>
<url>https://aclanthology.org/2021.tacl-1.76/</url>
</location>
<part>
<date>2021</date>
<detail type="volume"><number>9</number></detail>
<extent unit="page">
<start>1285</start>
<end>1302</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Lexically Aware Semi-Supervised Learning for OCR Post-Correction
%A Rijhwani, Shruti
%A Rosenblum, Daisy
%A Anastasopoulos, Antonios
%A Neubig, Graham
%J Transactions of the Association for Computational Linguistics
%D 2021
%V 9
%I MIT Press
%C Cambridge, MA
%F rijhwani-etal-2021-lexically
%X Much of the existing linguistic data in many languages of the world is locked away in non-digitized books and documents. Optical character recognition (OCR) can be used to produce digitized text, and previous work has demonstrated the utility of neural post-correction methods that improve the results of general-purpose OCR systems on recognition of less-well-resourced languages. However, these methods rely on manually curated post-correction data, which are relatively scarce compared to the non-annotated raw images that need to be digitized. In this paper, we present a semi-supervised learning method that makes it possible to utilize these raw images to improve performance, specifically through the use of self-training, a technique where a model is iteratively trained on its own outputs. In addition, to enforce consistency in the recognized vocabulary, we introduce a lexically aware decoding method that augments the neural post-correction model with a count-based language model constructed from the recognized texts, implemented using weighted finite-state automata (WFSA) for efficient and effective decoding. Results on four endangered languages demonstrate the utility of the proposed method, with relative error reductions of 15%–29%, where we find the combination of self-training and lexically aware decoding essential for achieving consistent improvements.
%R 10.1162/tacl_a_00427
%U https://aclanthology.org/2021.tacl-1.76/
%U https://doi.org/10.1162/tacl_a_00427
%P 1285-1302
Markdown (Informal)
[Lexically Aware Semi-Supervised Learning for OCR Post-Correction](https://aclanthology.org/2021.tacl-1.76/) (Rijhwani et al., TACL 2021)
ACL
Shruti Rijhwani, Daisy Rosenblum, Antonios Anastasopoulos, and Graham Neubig. 2021. Lexically Aware Semi-Supervised Learning for OCR Post-Correction. Transactions of the Association for Computational Linguistics, 9:1285–1302.