Source code for panda_guard.role.attacks.gpt4_cipher

# encoding: utf-8
# Author    : Shen Sicheng
# Datetime  : 2024/12/02 16:30
# User      : yu
# Product   : PyCharm
# Project   : panda-guard
# File      : gpt4_cipher.py
# explain   : gpt4_cipher: https://arxiv.org/abs/2308.06463 ICLR 2024

from typing import Dict, List, Any
from dataclasses import dataclass, field
from panda_guard.role.attacks import BaseAttacker, BaseAttackerConfig

from panda_guard.utils import is_user_turn


@dataclass
class GPT4CipherAttackerConfig(BaseAttackerConfig):
    """
    Configuration for the GPT-4 cipher attacker.

    :param attacker_cls: Class name of the attacker, default is "ArtPromptAttacker".
    :param attacker_name: Name of the attacker, default is "ArtPrompt".
    :param llm_config: Configuration for the language model (not declared
        here; presumably inherited from BaseAttackerConfig -- confirm).
    :param llm_gen_config: Configuration for generating output with the LLM
        (not declared here; presumably inherited from BaseAttackerConfig -- confirm).
    :param rewrite_template: Text the attacker places in front of the
        Caesar-encoded user prompt, default is None.
    """

    # NOTE(review): both defaults name the ArtPrompt attacker rather than this
    # cipher attacker -- presumably a copy-paste leftover. Left as-is because
    # changing a default would alter behaviour for configs that rely on it;
    # confirm against the attacker registry before renaming.
    attacker_cls: str = "ArtPromptAttacker"
    attacker_name: str = "ArtPrompt"
    # None means "no template configured".
    rewrite_template: str = None
class GPT4CipherAttacker(BaseAttacker):
    """
    Cipher attacker: Caesar-encodes the latest user prompt and prefixes it
    with a configurable template.

    The methods defined on this class do not call an LLM; the whole rewrite is
    a local string transformation.

    NOTE(review): the previous docstring mentioned ``mask_llm_url`` and
    ``OPENAI_API_KEY``. Nothing in this class reads them; whether
    ``BaseAttacker`` needs them is not visible from here -- confirm.
    """

    def __init__(self, config: GPT4CipherAttackerConfig):
        """
        :param config: Attacker configuration; ``config.rewrite_template`` is
            stored and later used as the prefix of the encoded prompt.
        """
        super().__init__(config)
        self.rewrite_template = config.rewrite_template

    def caesar_cipher(self, text, shift=3) -> str:
        """
        Encode a string with a Caesar cipher.

        Only the ASCII letters ``a-z`` and ``A-Z`` are rotated, each within
        its own case. Every other character (digits, punctuation, whitespace,
        accented or non-Latin letters) is returned unchanged, because a
        26-letter rotation is not defined for it.

        :param text: String to encode.
        :param shift: Number of alphabet positions to rotate by; any integer
            is accepted and reduced modulo 26 (negative values rotate backwards).
        :return: The encoded string, same length as ``text``.
        """
        offset = shift % 26
        lower = "abcdefghijklmnopqrstuvwxyz"
        upper = lower.upper()
        # One translation table covering both cases; characters that are not
        # in the table pass through str.translate untouched.
        table = str.maketrans(
            lower + upper,
            lower[offset:] + lower[:offset] + upper[offset:] + upper[:offset],
        )
        return text.translate(table)

    def attack(
        self,
        messages: List[Dict[str, str]],
        **kwargs
    ) -> List[Dict[str, str]]:
        """
        Replace the latest user prompt with its Caesar-encoded form, prefixed
        by the rewrite template.

        The last message dict is modified in place and the same list object is
        returned.

        :param messages: List of messages in the conversation; the last one
            must be a user turn.
        :param kwargs: Additional parameters for the attack (unused here).
        :return: The same list of messages with the rewritten prompt.
        :raises AssertionError: If ``is_user_turn(messages)`` is false.
        """
        # Kept as an assert so the exception type seen by existing callers
        # does not change.
        assert is_user_turn(messages)

        # The config default for the template is None; treat that as an empty
        # prefix instead of failing on ``None + str``.
        prefix = self.rewrite_template or ""
        encoded_prompt = self.caesar_cipher(messages[-1]["content"], 3)

        messages[-1]["content"] = prefix + encoded_prompt
        return messages