@inproceedings{lofgren-dannells-2024-post,
    title     = {Post-{OCR} Correction of Digitized {Swedish} Newspapers with {ByT5}},
    author    = {L{\"o}fgren, Viktoria and
                 Dann{\'e}lls, Dana},
    editor    = {Bizzoni, Yuri and
                 Degaetano-Ortlieb, Stefania and
                 Kazantseva, Anna and
                 Szpakowicz, Stan},
    booktitle = {Proceedings of the 8th Joint {SIGHUM} Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature ({LaTeCH-CLfL} 2024)},
    month     = mar,
    year      = {2024},
    address   = {St. Julians, Malta},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2024.latechclfl-1.23},
    pages     = {237--242},
    abstract  = {Many collections of digitized newspapers suffer from poor OCR quality, which impacts readability, information retrieval, and analysis of the material. Errors in OCR output can be reduced by applying machine translation models to {``}translate{''} it into a corrected version. Although transformer models show promising results in post-OCR correction and related tasks in other languages, they have not yet been explored for correcting OCR errors in Swedish texts. This paper presents a post-OCR correction model for Swedish 19th to 21st century newspapers based on the pre-trained transformer model ByT5. Three versions of the model were trained on different mixes of training data. The best model, which achieved a 36{\%} reduction in CER, is made freely available and will be integrated into the automatic processing pipeline of Spr{\aa}kbanken Text, a Swedish language technology infrastructure containing modern and historical written data.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lofgren-dannells-2024-post">
<titleInfo>
<title>Post-OCR Correction of Digitized Swedish Newspapers with ByT5</title>
</titleInfo>
<name type="personal">
<namePart type="given">Viktoria</namePart>
<namePart type="family">Löfgren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dana</namePart>
<namePart type="family">Dannélls</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 8th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuri</namePart>
<namePart type="family">Bizzoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stefania</namePart>
<namePart type="family">Degaetano-Ortlieb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Kazantseva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stan</namePart>
<namePart type="family">Szpakowicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">St. Julians, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Many collections of digitized newspapers suffer from poor OCR quality, which impacts readability, information retrieval, and analysis of the material. Errors in OCR output can be reduced by applying machine translation models to “translate” it into a corrected version. Although transformer models show promising results in post-OCR correction and related tasks in other languages, they have not yet been explored for correcting OCR errors in Swedish texts. This paper presents a post-OCR correction model for Swedish 19th to 21st century newspapers based on the pre-trained transformer model ByT5. Three versions of the model were trained on different mixes of training data. The best model, which achieved a 36% reduction in CER, is made freely available and will be integrated into the automatic processing pipeline of Språkbanken Text, a Swedish language technology infrastructure containing modern and historical written data.</abstract>
<identifier type="citekey">lofgren-dannells-2024-post</identifier>
<location>
<url>https://aclanthology.org/2024.latechclfl-1.23</url>
</location>
<part>
<date>2024-03</date>
<extent unit="page">
<start>237</start>
<end>242</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Post-OCR Correction of Digitized Swedish Newspapers with ByT5
%A Löfgren, Viktoria
%A Dannélls, Dana
%Y Bizzoni, Yuri
%Y Degaetano-Ortlieb, Stefania
%Y Kazantseva, Anna
%Y Szpakowicz, Stan
%S Proceedings of the 8th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2024)
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julians, Malta
%F lofgren-dannells-2024-post
%X Many collections of digitized newspapers suffer from poor OCR quality, which impacts readability, information retrieval, and analysis of the material. Errors in OCR output can be reduced by applying machine translation models to “translate” it into a corrected version. Although transformer models show promising results in post-OCR correction and related tasks in other languages, they have not yet been explored for correcting OCR errors in Swedish texts. This paper presents a post-OCR correction model for Swedish 19th to 21st century newspapers based on the pre-trained transformer model ByT5. Three versions of the model were trained on different mixes of training data. The best model, which achieved a 36% reduction in CER, is made freely available and will be integrated into the automatic processing pipeline of Språkbanken Text, a Swedish language technology infrastructure containing modern and historical written data.
%U https://aclanthology.org/2024.latechclfl-1.23
%P 237-242
Markdown (Informal)
[Post-OCR Correction of Digitized Swedish Newspapers with ByT5](https://aclanthology.org/2024.latechclfl-1.23) (Löfgren & Dannélls, LaTeCHCLfL-WS 2024)
ACL
- Viktoria Löfgren and Dana Dannélls. 2024. Post-OCR Correction of Digitized Swedish Newspapers with ByT5. In Proceedings of the 8th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2024), pages 237–242, St. Julians, Malta. Association for Computational Linguistics.