@inproceedings{vasquez-rodriguez-etal-2022-benchmark,
title = "A Benchmark for Neural Readability Assessment of Texts in {S}panish",
author = "V{\'a}squez-Rodr{\'i}guez, Laura and
Cuenca-Jim{\'e}nez, Pedro-Manuel and
Morales-Esquivel, Sergio and
Alva-Manchego, Fernando",
editor = "{\v{S}}tajner, Sanja and
Saggion, Horacio and
Ferr{\'e}s, Daniel and
Shardlow, Matthew and
Sheang, Kim Cheng and
North, Kai and
Zampieri, Marcos and
Xu, Wei",
booktitle = "Proceedings of the Workshop on Text Simplification, Accessibility, and Readability (TSAR-2022)",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates (Virtual)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.tsar-1.18/",
doi = "10.18653/v1/2022.tsar-1.18",
pages = "188--198",
    abstract = "We release a new benchmark for Automated Readability Assessment (ARA) of texts in Spanish. We combined existing corpora with suitable texts collected from the Web, thus creating the largest available dataset for ARA of Spanish texts. All data was pre-processed and categorised to allow experimenting with ARA models that make predictions at two (simple and complex) or three (basic, intermediate, and advanced) readability levels, and at two text granularities (paragraphs and sentences). An analysis based on readability indices shows that our proposed dataset groupings are suitable for their designated readability levels. We use our benchmark to train neural ARA models based on BERT in zero-shot, few-shot, and cross-lingual settings. Results show that either a monolingual or a multilingual pre-trained model can achieve good results when fine-tuned on language-specific data. In addition, all models decrease in performance when predicting three classes instead of two, showing opportunities for the development of better ARA models for Spanish with existing resources."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="vasquez-rodriguez-etal-2022-benchmark">
<titleInfo>
<title>A Benchmark for Neural Readability Assessment of Texts in Spanish</title>
</titleInfo>
<name type="personal">
<namePart type="given">Laura</namePart>
<namePart type="family">Vásquez-Rodríguez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pedro-Manuel</namePart>
<namePart type="family">Cuenca-Jiménez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergio</namePart>
<namePart type="family">Morales-Esquivel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fernando</namePart>
<namePart type="family">Alva-Manchego</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Text Simplification, Accessibility, and Readability (TSAR-2022)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sanja</namePart>
<namePart type="family">Štajner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Horacio</namePart>
<namePart type="family">Saggion</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Ferrés</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthew</namePart>
<namePart type="family">Shardlow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kim</namePart>
<namePart type="given">Cheng</namePart>
<namePart type="family">Sheang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="family">North</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates (Virtual)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We release a new benchmark for Automated Readability Assessment (ARA) of texts in Spanish. We combined existing corpora with suitable texts collected from the Web, thus creating the largest available dataset for ARA of Spanish texts. All data was pre-processed and categorised to allow experimenting with ARA models that make predictions at two (simple and complex) or three (basic, intermediate, and advanced) readability levels, and at two text granularities (paragraphs and sentences). An analysis based on readability indices shows that our proposed dataset groupings are suitable for their designated readability levels. We use our benchmark to train neural ARA models based on BERT in zero-shot, few-shot, and cross-lingual settings. Results show that either a monolingual or a multilingual pre-trained model can achieve good results when fine-tuned on language-specific data. In addition, all models decrease in performance when predicting three classes instead of two, showing opportunities for the development of better ARA models for Spanish with existing resources.</abstract>
<identifier type="citekey">vasquez-rodriguez-etal-2022-benchmark</identifier>
<identifier type="doi">10.18653/v1/2022.tsar-1.18</identifier>
<location>
<url>https://aclanthology.org/2022.tsar-1.18/</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>188</start>
<end>198</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Benchmark for Neural Readability Assessment of Texts in Spanish
%A Vásquez-Rodríguez, Laura
%A Cuenca-Jiménez, Pedro-Manuel
%A Morales-Esquivel, Sergio
%A Alva-Manchego, Fernando
%Y Štajner, Sanja
%Y Saggion, Horacio
%Y Ferrés, Daniel
%Y Shardlow, Matthew
%Y Sheang, Kim Cheng
%Y North, Kai
%Y Zampieri, Marcos
%Y Xu, Wei
%S Proceedings of the Workshop on Text Simplification, Accessibility, and Readability (TSAR-2022)
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates (Virtual)
%F vasquez-rodriguez-etal-2022-benchmark
%X We release a new benchmark for Automated Readability Assessment (ARA) of texts in Spanish. We combined existing corpora with suitable texts collected from the Web, thus creating the largest available dataset for ARA of Spanish texts. All data was pre-processed and categorised to allow experimenting with ARA models that make predictions at two (simple and complex) or three (basic, intermediate, and advanced) readability levels, and at two text granularities (paragraphs and sentences). An analysis based on readability indices shows that our proposed dataset groupings are suitable for their designated readability levels. We use our benchmark to train neural ARA models based on BERT in zero-shot, few-shot, and cross-lingual settings. Results show that either a monolingual or a multilingual pre-trained model can achieve good results when fine-tuned on language-specific data. In addition, all models decrease in performance when predicting three classes instead of two, showing opportunities for the development of better ARA models for Spanish with existing resources.
%R 10.18653/v1/2022.tsar-1.18
%U https://aclanthology.org/2022.tsar-1.18/
%U https://doi.org/10.18653/v1/2022.tsar-1.18
%P 188-198
Markdown (Informal)
[A Benchmark for Neural Readability Assessment of Texts in Spanish](https://aclanthology.org/2022.tsar-1.18/) (Vásquez-Rodríguez et al., TSAR 2022)
ACL
Laura Vásquez-Rodríguez, Pedro-Manuel Cuenca-Jiménez, Sergio Morales-Esquivel, and Fernando Alva-Manchego. 2022. A Benchmark for Neural Readability Assessment of Texts in Spanish. In Proceedings of the Workshop on Text Simplification, Accessibility, and Readability (TSAR-2022), pages 188–198, Abu Dhabi, United Arab Emirates (Virtual). Association for Computational Linguistics.