@inproceedings{wang-etal-2023-upton,
title = "{UPTON}: Preventing Authorship Leakage from Public Text Release via Data Poisoning",
author = "Wang, Ziyao and
Le, Thai and
Lee, Dongwon",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.800",
doi = "10.18653/v1/2023.findings-emnlp.800",
pages = "11952--11965",
abstract = "Consider a scenario where an author (e.g., activist, whistle-blower) with many public writings wishes to write {``}anonymously{''} when attackers may have already built an authorship attribution (AA) model based off of public writings including those of the author. To enable her wish, we ask a question {``}can one make the publicly released writings, T , unattributable so that AA models trained on T cannot attribute its authorship well?{''} Toward this question, we present a novel solution, UPTON, that exploits black-box data poisoning methods to weaken the authorship features in training samples and make released texts unlearnable. It is different from previous obfuscation works (e.g., adversarial attacks that modify test samples or backdoor works that only change the model outputs when triggering words occur). Using four authorship datasets (IMDb10, IMDb64, Enron and WJO), we present empirical validation where UPTON successfully downgrades the accuracy of AA models to the impractical level (e.g., {\textasciitilde} 35{\%}) while keeping texts still readable (e.g., {\textgreater} 0.9 in BERTScore). UPTON remains effective to AA models that are already trained on available clean writings of authors.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2023-upton">
<titleInfo>
<title>UPTON: Preventing Authorship Leakage from Public Text Release via Data Poisoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ziyao</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thai</namePart>
<namePart type="family">Le</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dongwon</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Consider a scenario where an author (e.g., activist, whistle-blower) with many public writings wishes to write “anonymously” when attackers may have already built an authorship attribution (AA) model based on public writings including those of the author. To enable her wish, we ask the question: “can one make the publicly released writings, T, unattributable so that AA models trained on T cannot attribute its authorship well?” Toward this question, we present a novel solution, UPTON, that exploits black-box data poisoning methods to weaken the authorship features in training samples and make released texts unlearnable. It is different from previous obfuscation works (e.g., adversarial attacks that modify test samples or backdoor works that only change the model outputs when triggering words occur). Using four authorship datasets (IMDb10, IMDb64, Enron and WJO), we present empirical validation where UPTON successfully downgrades the accuracy of AA models to an impractical level (e.g., ~35%) while keeping the texts readable (e.g., >0.9 in BERTScore). UPTON remains effective against AA models already trained on available clean writings of the authors.</abstract>
<identifier type="citekey">wang-etal-2023-upton</identifier>
<identifier type="doi">10.18653/v1/2023.findings-emnlp.800</identifier>
<location>
<url>https://aclanthology.org/2023.findings-emnlp.800</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>11952</start>
<end>11965</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T UPTON: Preventing Authorship Leakage from Public Text Release via Data Poisoning
%A Wang, Ziyao
%A Le, Thai
%A Lee, Dongwon
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F wang-etal-2023-upton
%X Consider a scenario where an author (e.g., activist, whistle-blower) with many public writings wishes to write “anonymously” when attackers may have already built an authorship attribution (AA) model based on public writings including those of the author. To enable her wish, we ask the question: “can one make the publicly released writings, T, unattributable so that AA models trained on T cannot attribute its authorship well?” Toward this question, we present a novel solution, UPTON, that exploits black-box data poisoning methods to weaken the authorship features in training samples and make released texts unlearnable. It is different from previous obfuscation works (e.g., adversarial attacks that modify test samples or backdoor works that only change the model outputs when triggering words occur). Using four authorship datasets (IMDb10, IMDb64, Enron and WJO), we present empirical validation where UPTON successfully downgrades the accuracy of AA models to an impractical level (e.g., ~35%) while keeping the texts readable (e.g., >0.9 in BERTScore). UPTON remains effective against AA models already trained on available clean writings of the authors.
%R 10.18653/v1/2023.findings-emnlp.800
%U https://aclanthology.org/2023.findings-emnlp.800
%U https://doi.org/10.18653/v1/2023.findings-emnlp.800
%P 11952-11965
Markdown (Informal)
[UPTON: Preventing Authorship Leakage from Public Text Release via Data Poisoning](https://aclanthology.org/2023.findings-emnlp.800) (Wang et al., Findings 2023)
ACL
Ziyao Wang, Thai Le, and Dongwon Lee. 2023. [UPTON: Preventing Authorship Leakage from Public Text Release via Data Poisoning](https://aclanthology.org/2023.findings-emnlp.800). In *Findings of the Association for Computational Linguistics: EMNLP 2023*, pages 11952–11965, Singapore. Association for Computational Linguistics.