% Workshop paper, WOAH 2024 (ACL Anthology record 2024.woah-1.12).
% The system name in the title and the workshop acronym in the booktitle are
% brace-protected so sentence-casing styles do not lowercase them.
% NOTE(review): address appears to hold the workshop venue rather than the
% publisher's city (ACL Anthology export convention) -- kept as-is; confirm
% against the target bibliography style.
@inproceedings{kim-etal-2024-robust,
  author    = {Kim, Jinhwa and Derakhshan, Ali and Harris, Ian},
  title     = {Robust Safety Classifier Against Jailbreaking Attacks: {Adversarial Prompt Shield}},
  editor    = {Chung, Yi-Ling and Talat, Zeerak and Nozza, Debora and Plaza-del-Arco, Flor Miriam and R{\"o}ttger, Paul and Mostafazadeh Davani, Aida and Calabrese, Agostina},
  booktitle = {Proceedings of the 8th Workshop on Online Abuse and Harms ({WOAH} 2024)},
  month     = jun,
  year      = {2024},
  address   = {Mexico City, Mexico},
  publisher = {Association for Computational Linguistics},
  pages     = {159--170},
  doi       = {10.18653/v1/2024.woah-1.12},
  url       = {https://aclanthology.org/2024.woah-1.12/},
}