Source code for panda_guard.role.attacks.gpt4_cipher

# encoding: utf-8
# Author    : Shen Sicheng
# Datetime  : 2024/12/02 16:30
# User      : yu
# Product   : PyCharm
# Project   : panda-guard
# File      : gpt4_cipher.py
# explain   : gpt4_cipher: https://arxiv.org/abs/2308.06463 ICLR 2024

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

from panda_guard.role.attacks import BaseAttacker, BaseAttackerConfig
from panda_guard.utils import is_user_turn


@dataclass
class GPT4CipherAttackerConfig(BaseAttackerConfig):
    """
    Configuration for the GPT-4 Cipher (Caesar cipher) attacker.

    :param attacker_cls: Class of the attacker, default is "GPT4CipherAttacker".
    :param attacker_name: Name of the attacker, default is "GPT4Cipher".
    :param rewrite_template: Text that is prepended to the Caesar-encoded user
        prompt. Defaults to ``None`` (no template configured).
    """

    # The defaults name the attacker implemented in this module; they were
    # previously copy-pasted from the ArtPrompt attacker.
    attacker_cls: str = field(default="GPT4CipherAttacker")
    attacker_name: str = field(default="GPT4Cipher")
    # Optional because the field defaults to None when no template is given.
    rewrite_template: Optional[str] = field(default=None)



class GPT4CipherAttacker(BaseAttacker):
    """
    Attacker that Caesar-encodes the latest user prompt.

    The last message's content is encoded with a Caesar cipher (shift of 3)
    and the configured ``rewrite_template`` is prepended to the encoded text.

    Reference: https://arxiv.org/abs/2308.06463
    """

    def __init__(self, config: GPT4CipherAttackerConfig):
        """
        Store the template taken from the attacker configuration.

        :param config: Attacker configuration; ``config.rewrite_template`` is
            the text later prepended to the encoded prompt (may be ``None``).
        """
        super().__init__(config)

        self.rewrite_template = config.rewrite_template

    def caesar_cipher(self, text, shift=3) -> str:
        """
        Encode a string with a Caesar cipher.

        Only the ASCII letters ``A-Z`` and ``a-z`` are rotated, each within
        its own case. Every other character (digits, punctuation, whitespace,
        non-ASCII letters) is copied through unchanged.

        :param text: String to encode.
        :param shift: Number of alphabet positions to rotate by; negative
            values rotate backwards and values beyond 26 wrap around.
        :return: The encoded string, same length as ``text``.
        """

        def rotate(char: str) -> str:
            # Explicit ASCII ranges instead of str.isalpha(): isalpha() is
            # also true for non-ASCII letters, which the modulo-26 arithmetic
            # would fold into unrelated ASCII letters and thereby corrupt.
            if "a" <= char <= "z":
                base = ord("a")
            elif "A" <= char <= "Z":
                base = ord("A")
            else:
                return char
            return chr((ord(char) - base + shift) % 26 + base)

        return "".join(rotate(char) for char in text)

    def attack(
            self,
            messages: List[Dict[str, str]],
            **kwargs
    ) -> List[Dict[str, str]]:
        """
        Replace the latest prompt with the template plus its Caesar encoding.

        The last message is modified in place and the same list is returned.

        :param messages: List of messages in the conversation; the content of
            the last entry is the prompt that gets encoded.
        :param kwargs: Additional parameters for the attack (unused here).
        :return: The same ``messages`` list with the last message rewritten.
        """
        # Project helper; presumably verifies the last message is a user turn
        # (confirm against panda_guard.utils).
        assert is_user_turn(messages)

        original_prompt = messages[-1]["content"]
        encoded_prompt = self.caesar_cipher(original_prompt, 3)

        # A missing template (the config default is None) is treated as an
        # empty prefix instead of failing with "NoneType + str".
        template = self.rewrite_template or ""

        # NOTE: the template and the encoded prompt are sent together in the
        # same message; they are not split into separate system/user turns.
        messages[-1]["content"] = template + encoded_prompt

        return messages