@inproceedings{melli-etal-2020-gm,
title = "{GM}-{RKB} {W}iki{T}ext Error Correction Task and Baselines",
author = "Melli, Gabor and
Eldallal, Abdelrhman and
Lazem, Bassim and
Moreira, Olga",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Twelfth Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.295/",
pages = "2424--2430",
language = "eng",
ISBN = "979-10-95546-34-4",
abstract = "We introduce the GM-RKB WikiText Error Correction Task for the automatic detection and correction of typographical errors in WikiText annotated pages. The included corpus is based on a snapshot of the GM-RKB domain-specific semantic wiki consisting of a large collection of concepts, personages, and publications primary centered on data mining and machine learning research topics. Numerous Wikipedia pages were also included as additional training data in the task's evaluation process. The corpus was then automatically updated to synthetically include realistic errors to produce a training and evaluation ground truth comparison. We designed and evaluated two supervised baseline WikiFixer error correction methods: (1) a naive approach based on a maximum likelihood character-level language model; (2) and an advanced model based on a sequence-to-sequence (seq2seq) neural network architecture. Both error correction models operated at a character level. When compared against an off-the-shelf word-level spell checker these methods showed a significant improvement in the task's performance {--} with the seq2seq-based model correcting a higher number of errors than it introduced. Finally, we published our data and code."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="melli-etal-2020-gm">
<titleInfo>
<title>GM-RKB WikiText Error Correction Task and Baselines</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gabor</namePart>
<namePart type="family">Melli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdelrhman</namePart>
<namePart type="family">Eldallal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bassim</namePart>
<namePart type="family">Lazem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Olga</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Twelfth Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Cieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Isahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>We introduce the GM-RKB WikiText Error Correction Task for the automatic detection and correction of typographical errors in WikiText annotated pages. The included corpus is based on a snapshot of the GM-RKB domain-specific semantic wiki consisting of a large collection of concepts, personages, and publications primary centered on data mining and machine learning research topics. Numerous Wikipedia pages were also included as additional training data in the task’s evaluation process. The corpus was then automatically updated to synthetically include realistic errors to produce a training and evaluation ground truth comparison. We designed and evaluated two supervised baseline WikiFixer error correction methods: (1) a naive approach based on a maximum likelihood character-level language model; (2) and an advanced model based on a sequence-to-sequence (seq2seq) neural network architecture. Both error correction models operated at a character level. When compared against an off-the-shelf word-level spell checker these methods showed a significant improvement in the task’s performance – with the seq2seq-based model correcting a higher number of errors than it introduced. Finally, we published our data and code.</abstract>
<identifier type="citekey">melli-etal-2020-gm</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.295/</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>2424</start>
<end>2430</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GM-RKB WikiText Error Correction Task and Baselines
%A Melli, Gabor
%A Eldallal, Abdelrhman
%A Lazem, Bassim
%A Moreira, Olga
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Twelfth Language Resources and Evaluation Conference
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G eng
%F melli-etal-2020-gm
%X We introduce the GM-RKB WikiText Error Correction Task for the automatic detection and correction of typographical errors in WikiText annotated pages. The included corpus is based on a snapshot of the GM-RKB domain-specific semantic wiki consisting of a large collection of concepts, personages, and publications primary centered on data mining and machine learning research topics. Numerous Wikipedia pages were also included as additional training data in the task’s evaluation process. The corpus was then automatically updated to synthetically include realistic errors to produce a training and evaluation ground truth comparison. We designed and evaluated two supervised baseline WikiFixer error correction methods: (1) a naive approach based on a maximum likelihood character-level language model; (2) and an advanced model based on a sequence-to-sequence (seq2seq) neural network architecture. Both error correction models operated at a character level. When compared against an off-the-shelf word-level spell checker these methods showed a significant improvement in the task’s performance – with the seq2seq-based model correcting a higher number of errors than it introduced. Finally, we published our data and code.
%U https://aclanthology.org/2020.lrec-1.295/
%P 2424-2430
Markdown (Informal)
[GM-RKB WikiText Error Correction Task and Baselines](https://aclanthology.org/2020.lrec-1.295/) (Melli et al., LREC 2020)
ACL
- Gabor Melli, Abdelrhman Eldallal, Bassim Lazem, and Olga Moreira. 2020. GM-RKB WikiText Error Correction Task and Baselines. In Proceedings of the Twelfth Language Resources and Evaluation Conference, pages 2424–2430, Marseille, France. European Language Resources Association.