@inproceedings{protar-nemeskey-2023-hupwkp,
title = "hu{PWKP}: A {H}ungarian Text Simplification Corpus",
author = "Pr{\'o}t{\'a}r, No{\'e}mi and
Nemeskey, D{\'a}vid M{\'a}rk",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing",
month = sep,
year = "2023",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2023.ranlp-1.97",
pages = "898--907",
abstract = "In this article we introduce huPWKP, the first parallel corpus consisting of Hungarian standard language-simplified sentence pairs. As Hungarian is a quite low-resource language in regards to text simplification, we opted for translating an already existing corpus, PWKP (Zhu et al., 2010), on which we performed some cleaning in order to improve its quality. We evaluated the corpus both with the help of human evaluators and by training a seq2seq model on both the Hungarian corpus and the original (cleaned) English corpus. The Hungarian model performed slightly worse in terms of automatic metrics; however, the English model attains a SARI score close to the state of the art on the official PWKP set. According to the human evaluation, the corpus performs at around 3 on a scale ranging from 1 to 5 in terms of information retention and increase in simplification and around 3.7 in terms of grammaticality.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="protar-nemeskey-2023-hupwkp">
<titleInfo>
<title>huPWKP: A Hungarian Text Simplification Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Noémi</namePart>
<namePart type="family">Prótár</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dávid</namePart>
<namePart type="given">Márk</namePart>
<namePart type="family">Nemeskey</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this article we introduce huPWKP, the first parallel corpus consisting of Hungarian standard language-simplified sentence pairs. As Hungarian is a quite low-resource language in regards to text simplification, we opted for translating an already existing corpus, PWKP (Zhu et al., 2010), on which we performed some cleaning in order to improve its quality. We evaluated the corpus both with the help of human evaluators and by training a seq2seq model on both the Hungarian corpus and the original (cleaned) English corpus. The Hungarian model performed slightly worse in terms of automatic metrics; however, the English model attains a SARI score close to the state of the art on the official PWKP set. According to the human evaluation, the corpus performs at around 3 on a scale ranging from 1 to 5 in terms of information retention and increase in simplification and around 3.7 in terms of grammaticality.</abstract>
<identifier type="citekey">protar-nemeskey-2023-hupwkp</identifier>
<location>
<url>https://aclanthology.org/2023.ranlp-1.97</url>
</location>
<part>
<date>2023-09</date>
<extent unit="page">
<start>898</start>
<end>907</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T huPWKP: A Hungarian Text Simplification Corpus
%A Prótár, Noémi
%A Nemeskey, Dávid Márk
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing
%D 2023
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F protar-nemeskey-2023-hupwkp
%X In this article we introduce huPWKP, the first parallel corpus consisting of Hungarian standard language-simplified sentence pairs. As Hungarian is a quite low-resource language in regards to text simplification, we opted for translating an already existing corpus, PWKP (Zhu et al., 2010), on which we performed some cleaning in order to improve its quality. We evaluated the corpus both with the help of human evaluators and by training a seq2seq model on both the Hungarian corpus and the original (cleaned) English corpus. The Hungarian model performed slightly worse in terms of automatic metrics; however, the English model attains a SARI score close to the state of the art on the official PWKP set. According to the human evaluation, the corpus performs at around 3 on a scale ranging from 1 to 5 in terms of information retention and increase in simplification and around 3.7 in terms of grammaticality.
%U https://aclanthology.org/2023.ranlp-1.97
%P 898-907
Markdown (Informal)
[huPWKP: A Hungarian Text Simplification Corpus](https://aclanthology.org/2023.ranlp-1.97) (Prótár & Nemeskey, RANLP 2023)
ACL
- Noémi Prótár and Dávid Márk Nemeskey. 2023. huPWKP: A Hungarian Text Simplification Corpus. In Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing, pages 898–907, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.