@inproceedings{elmallah-etal-2024-arabic,
title = "{A}rabic Diacritization Using Morphologically Informed Character-Level Model",
author = "Elmallah, Muhammad Morsy and
Reda, Mahmoud and
Darwish, Kareem and
El-Sheikh, Abdelrahman and
Elneima, Ashraf Hatim and
Aljubran, Murtadha and
Alsaeed, Nouf and
Mohammed, Reem and
Al-Badrashiny, Mohamed",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.128/",
pages = "1446--1454",
abstract = "Arabic diacritic recovery i.e. diacritization is necessary for proper vocalization and an enabler for downstream applications such as language learning and text to speech. Diacritics come in two varieties, namely: core-word diacritics and case endings. In this paper we introduce a highly effective morphologically informed character-level model that can recover both types of diacritics simultaneously. The model uses a Recurrent Neural Network (RNN) based architecture that takes in text as a sequence of characters, with markers for morphological segmentation, and outputs a sequence of diacritics. We also introduce a character-based morphological segmentation model that we train for Modern Standard Arabic (MSA) and dialectal Arabic. We demonstrate the efficacy of our diacritization model on Classical Arabic, MSA, and two dialectal (Moroccan and Tunisian) texts. We achieve the lowest reported word-level diacritization error rate for MSA (3.4{\%}), match the best results for Classical Arabic (5.4{\%}), and report competitive results for dialectal Arabic."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="elmallah-etal-2024-arabic">
<titleInfo>
<title>Arabic Diacritization Using Morphologically Informed Character-Level Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="given">Morsy</namePart>
<namePart type="family">Elmallah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mahmoud</namePart>
<namePart type="family">Reda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kareem</namePart>
<namePart type="family">Darwish</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdelrahman</namePart>
<namePart type="family">El-Sheikh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashraf</namePart>
<namePart type="given">Hatim</namePart>
<namePart type="family">Elneima</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Murtadha</namePart>
<namePart type="family">Aljubran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nouf</namePart>
<namePart type="family">Alsaeed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reem</namePart>
<namePart type="family">Mohammed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Al-Badrashiny</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Arabic diacritic recovery i.e. diacritization is necessary for proper vocalization and an enabler for downstream applications such as language learning and text to speech. Diacritics come in two varieties, namely: core-word diacritics and case endings. In this paper we introduce a highly effective morphologically informed character-level model that can recover both types of diacritics simultaneously. The model uses a Recurrent Neural Network (RNN) based architecture that takes in text as a sequence of characters, with markers for morphological segmentation, and outputs a sequence of diacritics. We also introduce a character-based morphological segmentation model that we train for Modern Standard Arabic (MSA) and dialectal Arabic. We demonstrate the efficacy of our diacritization model on Classical Arabic, MSA, and two dialectal (Moroccan and Tunisian) texts. We achieve the lowest reported word-level diacritization error rate for MSA (3.4%), match the best results for Classical Arabic (5.4%), and report competitive results for dialectal Arabic.</abstract>
<identifier type="citekey">elmallah-etal-2024-arabic</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.128/</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>1446</start>
<end>1454</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Arabic Diacritization Using Morphologically Informed Character-Level Model
%A Elmallah, Muhammad Morsy
%A Reda, Mahmoud
%A Darwish, Kareem
%A El-Sheikh, Abdelrahman
%A Elneima, Ashraf Hatim
%A Aljubran, Murtadha
%A Alsaeed, Nouf
%A Mohammed, Reem
%A Al-Badrashiny, Mohamed
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F elmallah-etal-2024-arabic
%X Arabic diacritic recovery i.e. diacritization is necessary for proper vocalization and an enabler for downstream applications such as language learning and text to speech. Diacritics come in two varieties, namely: core-word diacritics and case endings. In this paper we introduce a highly effective morphologically informed character-level model that can recover both types of diacritics simultaneously. The model uses a Recurrent Neural Network (RNN) based architecture that takes in text as a sequence of characters, with markers for morphological segmentation, and outputs a sequence of diacritics. We also introduce a character-based morphological segmentation model that we train for Modern Standard Arabic (MSA) and dialectal Arabic. We demonstrate the efficacy of our diacritization model on Classical Arabic, MSA, and two dialectal (Moroccan and Tunisian) texts. We achieve the lowest reported word-level diacritization error rate for MSA (3.4%), match the best results for Classical Arabic (5.4%), and report competitive results for dialectal Arabic.
%U https://aclanthology.org/2024.lrec-main.128/
%P 1446-1454
Markdown (Informal)
[Arabic Diacritization Using Morphologically Informed Character-Level Model](https://aclanthology.org/2024.lrec-main.128/) (Elmallah et al., LREC-COLING 2024)
ACL
- Muhammad Morsy Elmallah, Mahmoud Reda, Kareem Darwish, Abdelrahman El-Sheikh, Ashraf Hatim Elneima, Murtadha Aljubran, Nouf Alsaeed, Reem Mohammed, and Mohamed Al-Badrashiny. 2024. Arabic Diacritization Using Morphologically Informed Character-Level Model. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 1446–1454, Torino, Italia. ELRA and ICCL.