@inproceedings{alotaibi-joy-2021-english,
title = "{E}nglish-{A}rabic Cross-language Plagiarism Detection",
author = "Alotaibi, Naif and
Joy, Mike",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)",
month = sep,
year = "2021",
address = "Held Online",
publisher = "INCOMA Ltd.",
url = "https://aclanthology.org/2021.ranlp-1.6",
pages = "44--52",
abstract = "The advancement of the web and information technology has contributed to the rapid growth of digital libraries and automatic machine translation tools which easily translate texts from one language into another. These have increased the content accessible in different languages, which results in easily performing translated plagiarism, which are referred to as {``}cross-language plagiarism{''}. Recognition of plagiarism among texts in different languages is more challenging than identifying plagiarism within a corpus written in the same language. This paper proposes a new technique for enhancing English-Arabic cross-language plagiarism detection at the sentence level. This technique is based on semantic and syntactic feature extraction using word order, word embedding and word alignment with multilingual encoders. Those features, and their combination with different machine learning (ML) algorithms, are then used in order to aid the task of classifying sentences as either plagiarized or non-plagiarized. The proposed approach has been deployed and assessed using datasets presented at SemEval-2017. Analysis of experimental data demonstrates that utilizing extracted features and their combinations with various ML classifiers achieves promising results.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="alotaibi-joy-2021-english">
<titleInfo>
<title>English-Arabic Cross-language Plagiarism Detection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Naif</namePart>
<namePart type="family">Alotaibi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Joy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd.</publisher>
<place>
<placeTerm type="text">Held Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The advancement of the web and information technology has contributed to the rapid growth of digital libraries and automatic machine translation tools which easily translate texts from one language into another. These have increased the content accessible in different languages, which results in easily performing translated plagiarism, which are referred to as “cross-language plagiarism”. Recognition of plagiarism among texts in different languages is more challenging than identifying plagiarism within a corpus written in the same language. This paper proposes a new technique for enhancing English-Arabic cross-language plagiarism detection at the sentence level. This technique is based on semantic and syntactic feature extraction using word order, word embedding and word alignment with multilingual encoders. Those features, and their combination with different machine learning (ML) algorithms, are then used in order to aid the task of classifying sentences as either plagiarized or non-plagiarized. The proposed approach has been deployed and assessed using datasets presented at SemEval-2017. Analysis of experimental data demonstrates that utilizing extracted features and their combinations with various ML classifiers achieves promising results.</abstract>
<identifier type="citekey">alotaibi-joy-2021-english</identifier>
<location>
<url>https://aclanthology.org/2021.ranlp-1.6</url>
</location>
<part>
<date>2021-09</date>
<extent unit="page">
<start>44</start>
<end>52</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T English-Arabic Cross-language Plagiarism Detection
%A Alotaibi, Naif
%A Joy, Mike
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)
%D 2021
%8 September
%I INCOMA Ltd.
%C Held Online
%F alotaibi-joy-2021-english
%X The advancement of the web and information technology has contributed to the rapid growth of digital libraries and automatic machine translation tools which easily translate texts from one language into another. These have increased the content accessible in different languages, which results in easily performing translated plagiarism, which are referred to as “cross-language plagiarism”. Recognition of plagiarism among texts in different languages is more challenging than identifying plagiarism within a corpus written in the same language. This paper proposes a new technique for enhancing English-Arabic cross-language plagiarism detection at the sentence level. This technique is based on semantic and syntactic feature extraction using word order, word embedding and word alignment with multilingual encoders. Those features, and their combination with different machine learning (ML) algorithms, are then used in order to aid the task of classifying sentences as either plagiarized or non-plagiarized. The proposed approach has been deployed and assessed using datasets presented at SemEval-2017. Analysis of experimental data demonstrates that utilizing extracted features and their combinations with various ML classifiers achieves promising results.
%U https://aclanthology.org/2021.ranlp-1.6
%P 44-52
Markdown (Informal)
[English-Arabic Cross-language Plagiarism Detection](https://aclanthology.org/2021.ranlp-1.6) (Alotaibi & Joy, RANLP 2021)
ACL