% Findings of ACL 2023 paper; ACL Anthology ID 2023.findings-acl.337.
% The coined system name in the title is brace-protected as a whole word so
% that sentence-casing bibliography styles keep its capitalisation without
% splitting the word into separately braced letters.
@inproceedings{zhou-etal-2023-textobfuscator,
  title     = {{TextObfuscator}: Making Pre-trained Language Model a Privacy Protector via Obfuscating Word Representations},
  author    = {Zhou, Xin and
               Lu, Yi and
               Ma, Ruotian and
               Gui, Tao and
               Wang, Yuran and
               Ding, Yong and
               Zhang, Yibo and
               Zhang, Qi and
               Huang, Xuanjing},
  editor    = {Rogers, Anna and
               Boyd-Graber, Jordan and
               Okazaki, Naoaki},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2023},
  month     = jul,
  year      = {2023},
  address   = {Toronto, Canada},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.findings-acl.337/},
  doi       = {10.18653/v1/2023.findings-acl.337},
  pages     = {5459--5473},
  abstract  = {In real-world applications, pre-trained language models are typically deployed on the cloud, allowing clients to upload data and perform compute-intensive inference remotely. To avoid sharing sensitive data directly with service providers, clients can upload numerical representations rather than plain text to the cloud. However, recent text reconstruction techniques have demonstrated that it is possible to transform representations into original words, suggesting that privacy risk remains. In this paper, we propose TextObfuscator, a novel framework for protecting inference privacy by applying random perturbations to clustered representations. The random perturbations make the representations indistinguishable from surrounding clustered representations, thus obscuring word information while retaining the original word functionality. To achieve this, we utilize prototypes to learn clustered representation, where tokens of similar functionality are encouraged to be closer to the same prototype during training. Additionally, we design different methods to find prototypes for token-level and sentence-level tasks, which can improve performance by incorporating semantic and task information. Experimental results on token and sentence classification tasks show that TextObfuscator achieves improvement over compared methods without increasing inference cost.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, ID zhou-etal-2023-textobfuscator. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhou-etal-2023-textobfuscator">
<!-- Title of the paper itself; the containing volume is described under relatedItem below. -->
<titleInfo>
<title>TextObfuscator: Making Pre-trained Language Model a Privacy Protector via Obfuscating Word Representations</title>
</titleInfo>
<!-- Authors: nine personal names, one name element each, all with role "author". -->
<name type="personal">
<namePart type="given">Xin</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruotian</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tao</namePart>
<namePart type="family">Gui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuran</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yong</namePart>
<namePart type="family">Ding</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yibo</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qi</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuanjing</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<!-- Issue date of the paper, given as year and month. -->
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the volume this paper appears in, with its three editors, publisher and place. -->
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In real-world applications, pre-trained language models are typically deployed on the cloud, allowing clients to upload data and perform compute-intensive inference remotely. To avoid sharing sensitive data directly with service providers, clients can upload numerical representations rather than plain text to the cloud. However, recent text reconstruction techniques have demonstrated that it is possible to transform representations into original words, suggesting that privacy risk remains. In this paper, we propose TextObfuscator, a novel framework for protecting inference privacy by applying random perturbations to clustered representations. The random perturbations make the representations indistinguishable from surrounding clustered representations, thus obscuring word information while retaining the original word functionality. To achieve this, we utilize prototypes to learn clustered representation, where tokens of similar functionality are encouraged to be closer to the same prototype during training. Additionally, we design different methods to find prototypes for token-level and sentence-level tasks, which can improve performance by incorporating semantic and task information. Experimental results on token and sentence classification tasks show that TextObfuscator achieves improvement over compared methods without increasing inference cost.</abstract>
<!-- Identifiers: the citation key (same value as the mods ID attribute) and the DOI, followed by the record URL. -->
<identifier type="citekey">zhou-etal-2023-textobfuscator</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.337</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.337/</url>
</location>
<!-- Part details: date plus first and last page of the paper. -->
<part>
<date>2023-07</date>
<extent unit="page">
<start>5459</start>
<end>5473</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TextObfuscator: Making Pre-trained Language Model a Privacy Protector via Obfuscating Word Representations
%A Zhou, Xin
%A Lu, Yi
%A Ma, Ruotian
%A Gui, Tao
%A Wang, Yuran
%A Ding, Yong
%A Zhang, Yibo
%A Zhang, Qi
%A Huang, Xuanjing
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F zhou-etal-2023-textobfuscator
%X In real-world applications, pre-trained language models are typically deployed on the cloud, allowing clients to upload data and perform compute-intensive inference remotely. To avoid sharing sensitive data directly with service providers, clients can upload numerical representations rather than plain text to the cloud. However, recent text reconstruction techniques have demonstrated that it is possible to transform representations into original words, suggesting that privacy risk remains. In this paper, we propose TextObfuscator, a novel framework for protecting inference privacy by applying random perturbations to clustered representations. The random perturbations make the representations indistinguishable from surrounding clustered representations, thus obscuring word information while retaining the original word functionality. To achieve this, we utilize prototypes to learn clustered representation, where tokens of similar functionality are encouraged to be closer to the same prototype during training. Additionally, we design different methods to find prototypes for token-level and sentence-level tasks, which can improve performance by incorporating semantic and task information. Experimental results on token and sentence classification tasks show that TextObfuscator achieves improvement over compared methods without increasing inference cost.
%R 10.18653/v1/2023.findings-acl.337
%U https://aclanthology.org/2023.findings-acl.337/
%U https://doi.org/10.18653/v1/2023.findings-acl.337
%P 5459-5473
Markdown (Informal)
[TextObfuscator: Making Pre-trained Language Model a Privacy Protector via Obfuscating Word Representations](https://aclanthology.org/2023.findings-acl.337/) (Zhou et al., Findings 2023)
ACL
- Xin Zhou, Yi Lu, Ruotian Ma, Tao Gui, Yuran Wang, Yong Ding, Yibo Zhang, Qi Zhang, and Xuanjing Huang. 2023. TextObfuscator: Making Pre-trained Language Model a Privacy Protector via Obfuscating Word Representations. In Findings of the Association for Computational Linguistics: ACL 2023, pages 5459–5473, Toronto, Canada. Association for Computational Linguistics.