@inproceedings{ebrahim-joy-2023-source,
title = "Source Code Plagiarism Detection with Pre-Trained Model Embeddings and Automated Machine Learning",
author = "Ebrahim, Fahad and
Joy, Mike",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing",
month = sep,
year = "2023",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2023.ranlp-1.34",
pages = "301--309",
abstract = "Source code plagiarism is a critical ethical issue in computer science education where students use someone else{'}s work as their own. It can be treated as a binary classification problem where the output can be either: yes (plagiarism found) or no (plagiarism not found). In this research, we have taken the open-source dataset {`}SOCO{'}, which contains two programming languages (PLs), namely Java and C/C++ (although our method could be applied to any PL). Source codes should be converted to vector representations that capture both the syntax and semantics of the text, known as contextual embeddings. These embeddings would be generated using source code pre-trained models (CodePTMs). The cosine similarity scores of three different CodePTMs were selected as features. The classifier selection and parameter tuning were conducted with the assistance of Automated Machine Learning (AutoML). The selected classifiers were tested, initially on Java, and the proposed approach produced average to high results compared to other published research, and surpassed the baseline (the JPlag plagiarism detection tool). For C/C++, the approach outperformed other research work and produced the highest ranking score.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ebrahim-joy-2023-source">
<titleInfo>
<title>Source Code Plagiarism Detection with Pre-Trained Model Embeddings and Automated Machine Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fahad</namePart>
<namePart type="family">Ebrahim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Joy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Source code plagiarism is a critical ethical issue in computer science education where students use someone else’s work as their own. It can be treated as a binary classification problem where the output can be either: yes (plagiarism found) or no (plagiarism not found). In this research, we have taken the open-source dataset ‘SOCO’, which contains two programming languages (PLs), namely Java and C/C++ (although our method could be applied to any PL). Source codes should be converted to vector representations that capture both the syntax and semantics of the text, known as contextual embeddings. These embeddings would be generated using source code pre-trained models (CodePTMs). The cosine similarity scores of three different CodePTMs were selected as features. The classifier selection and parameter tuning were conducted with the assistance of Automated Machine Learning (AutoML). The selected classifiers were tested, initially on Java, and the proposed approach produced average to high results compared to other published research, and surpassed the baseline (the JPlag plagiarism detection tool). For C/C++, the approach outperformed other research work and produced the highest ranking score.</abstract>
<identifier type="citekey">ebrahim-joy-2023-source</identifier>
<location>
<url>https://aclanthology.org/2023.ranlp-1.34</url>
</location>
<part>
<date>2023-09</date>
<extent unit="page">
<start>301</start>
<end>309</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Source Code Plagiarism Detection with Pre-Trained Model Embeddings and Automated Machine Learning
%A Ebrahim, Fahad
%A Joy, Mike
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing
%D 2023
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F ebrahim-joy-2023-source
%X Source code plagiarism is a critical ethical issue in computer science education where students use someone else’s work as their own. It can be treated as a binary classification problem where the output can be either: yes (plagiarism found) or no (plagiarism not found). In this research, we have taken the open-source dataset ‘SOCO’, which contains two programming languages (PLs), namely Java and C/C++ (although our method could be applied to any PL). Source codes should be converted to vector representations that capture both the syntax and semantics of the text, known as contextual embeddings. These embeddings would be generated using source code pre-trained models (CodePTMs). The cosine similarity scores of three different CodePTMs were selected as features. The classifier selection and parameter tuning were conducted with the assistance of Automated Machine Learning (AutoML). The selected classifiers were tested, initially on Java, and the proposed approach produced average to high results compared to other published research, and surpassed the baseline (the JPlag plagiarism detection tool). For C/C++, the approach outperformed other research work and produced the highest ranking score.
%U https://aclanthology.org/2023.ranlp-1.34
%P 301-309
Markdown (Informal)
[Source Code Plagiarism Detection with Pre-Trained Model Embeddings and Automated Machine Learning](https://aclanthology.org/2023.ranlp-1.34) (Ebrahim & Joy, RANLP 2023)
ACL