Source code for panda_guard.role.attacks.art_prompt

# encoding: utf-8
# Author    : Shen Sicheng
# Datetime  : 2024/12/02 16:30
# User      : yu
# Product   : PyCharm
# Project   : panda-guard
# File      :art_prompt.py
# explain   : ArtPrompt attacker - masks sensitive words in the user prompt and re-renders them as ASCII art

import json
import os
from typing import Dict, List, Any
from dataclasses import dataclass, field
from openai import OpenAI
from panda_guard.role.attacks import BaseAttacker, BaseAttackerConfig

from panda_guard.llms import create_llm, BaseLLMConfig, LLMGenerateConfig
from panda_guard.utils import is_user_turn


@dataclass
class ArtPromptAttackerConfig(BaseAttackerConfig):
    """
    Configuration for the ArtPrompt attacker.

    :param attacker_cls: Class of the attacker, default is "ArtPromptAttacker".
    :param attacker_name: Name of the attacker, default is "ArtPrompt".
    :param mask_llm_url: Base URL handed to the OpenAI client that talks to
        the mask LLM.
    :param llm_gen_config: Configuration for generating output with LLM.
    :param rewrite_template: Template (sent as the system message) that tells
        the mask LLM how to mask offensive words.
    """

    # Identification of the attacker.
    attacker_cls: str = "ArtPromptAttacker"
    attacker_name: str = "ArtPrompt"

    # Mask-LLM settings; all of them default to None until set by the config.
    mask_llm_url: str = None
    llm_gen_config: LLMGenerateConfig = None
    rewrite_template: str = None



class ArtPromptAttacker(BaseAttacker):
    """
    ArtPrompt attacker: re-renders the sensitive words of a prompt as ASCII art.

    The latest user prompt is sent to a mask LLM together with
    ``rewrite_template``. The reply is parsed for a list of masked words and
    a masked instruction; each ``[MASK]`` placeholder of the instruction is
    then replaced by the ASCII-art rendering of the corresponding word.

    TIPS:
    mask_llm_url should be set in the config file; it is used as the base URL
    of the OpenAI client that generates the masked words and instruction.
    The mask model is currently hard-coded to gpt-3.5-turbo.

    Correspondingly, OPENAI_API_KEY should be set in environment variables.
    """

    # Default location of the ASCII-art font table. Can be overridden per
    # instance through an optional ``ascii_art_path`` attribute on the config.
    ASCII_ART_PATH = '/home/shensicheng/code/jailbreak/data/ArtPrompt/vitc-s.json'

    # Parsed font tables keyed by file path, so the JSON file is read once
    # instead of once per masked word.
    _ascii_data_cache: Dict[str, Any] = {}

    def __init__(self, config: ArtPromptAttackerConfig):
        """
        Store the mask-LLM settings from the configuration.

        :param config: Configuration of the attacker.
        """
        super().__init__(config)
        self.mask_llm_url = config.mask_llm_url
        # NOTE(review): stored but not used anywhere in this class yet.
        self.llm_gen_config = config.llm_gen_config
        self.rewrite_template = config.rewrite_template
        # Optional override; falls back to the class-level default path.
        self.ascii_art_path = getattr(config, "ascii_art_path", None) or self.ASCII_ART_PATH

    def parse(self, response):
        """
        Parse the mask LLM response into masked words and masked instruction.

        The words are read from the ``### Masked words:`` section, expected to
        look like ``[word1, word2]``; the instruction is everything after
        ``### Masked instruction:``.

        :param response: The response from LLM to [MASK] sensitive word.
        :return: ``(masked_words, masked_instruction)`` where ``masked_words``
            is a list of non-empty strings (empty when the section is missing)
            and ``masked_instruction`` is a string (empty when missing).
        """
        masked_words = []  # may be multiple masked words
        words_marker = "### Masked words:"
        if words_marker in response:
            section = response.split(words_marker)[1].split("###")[0].strip()
            # A missing "[" gives -1, so the slice below starts at 0.
            open_idx = section.find("[")
            close_idx = section.find("]", open_idx + 1)
            if close_idx == -1:
                # No closing bracket: use the rest of the section instead of
                # silently dropping its last character.
                close_idx = len(section)
            candidates = section[open_idx + 1:close_idx].split(",")
            # Drop empty entries (e.g. from "[]" or a trailing comma).
            masked_words = [word.strip() for word in candidates if word.strip()]

        masked_instruction = ""
        instruction_marker = "### Masked instruction:"
        if instruction_marker in response:
            masked_instruction = response.split(instruction_marker)[1].strip()

        return masked_words, masked_instruction

    def ASCII_sub(self, word, font_seed=0):
        """
        Convert a masked word to ASCII art.

        Characters are upper-cased before lookup; characters that have no
        glyph in the selected font are skipped.

        :param word: The word to be converted to ASCII art.
        :param font_seed: int (0-233) selecting the font. Default is 0.
            Out-of-range values are reported on stdout and clamped.
        :return: The ASCII art as a string, wrapped in leading/trailing newlines.
        """
        data_path = getattr(self, "ascii_art_path", None) or ArtPromptAttacker.ASCII_ART_PATH
        cache = ArtPromptAttacker._ascii_data_cache
        if data_path not in cache:
            with open(data_path, 'r', encoding='utf-8') as f:
                cache[data_path] = json.load(f)
        ascii_data = cache[data_path]

        # Explicit check instead of `assert`, which is removed under `python -O`
        # and would then leave an invalid seed unclamped.
        if not 0 <= font_seed < 234:
            print("Assertion failed: font_seed should be an integer in range 0-233")
            font_seed = max(0, min(font_seed, 233))

        # Each font owns a contiguous window of 36 ids in the table.
        first_id = font_seed * 36
        last_id = (font_seed + 1) * 36
        glyphs = {
            item["text"]: item["art_text"]
            for item in ascii_data
            if first_id <= item["id"] < last_id
        }

        # Glyph height is taken from the newline count of "A".
        # NOTE(review): this assumes every glyph ends with a trailing newline
        # and shares the same height - confirm against the data file.
        font_lines = glyphs["A"].count('\n')

        rendered = [glyphs[char].split("\n") for char in word.upper() if char in glyphs]
        art_lines = [
            "".join(art[row] + "  " for art in rendered)
            for row in range(font_lines)
        ]

        return "\n" + "\n".join(art_lines) + "\n"  # ASCII art

    def artprompt_gen(self, masked_words, masked_instruction):
        """
        Generate the ArtPrompt by filling the placeholders with ASCII art.

        With a single masked word both ``[MASK]`` and ``[MASK1]`` are filled;
        with several words the placeholders ``[MASK1]``, ``[MASK2]``, ... are
        filled in order.

        :param masked_words: list of masked words to render as ASCII art.
        :param masked_instruction: str, masked instruction with [MASK] placeholders.
        :return: str, masked instruction with ASCII art substituted in.
        """
        ascii_words = [self.ASCII_sub(word) for word in masked_words]
        if len(ascii_words) == 1:
            only_art = ascii_words[0]
            # The mask LLM may number the placeholder even for a single word.
            return masked_instruction.replace("[MASK]", only_art).replace("[MASK1]", only_art)

        for position, ascii_art in enumerate(ascii_words, start=1):
            masked_instruction = masked_instruction.replace(f"[MASK{position}]", ascii_art)
        return masked_instruction

    def ascii_art(self, prompt: str) -> str:
        """
        Turn a prompt into an ArtPrompt with the help of the mask LLM.

        :param prompt: The user prompt to transform.
        :return: The masked instruction with ASCII art, or the original prompt
            when the mask LLM reply contains no masked instruction.
        """
        # TODO @shensicheng: Use panda_guard.llms.create_llm to Create a LLM
        api_key = os.getenv("OPENAI_API_KEY")
        client = OpenAI(
            api_key=api_key,
            base_url=self.mask_llm_url,
        )

        chat_messages = [
            {"role": "system", "content": self.rewrite_template},
            {"role": "user", "content": prompt},
        ]

        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=chat_messages,
        )

        # The message content may be None; treat that as an empty reply.
        masked_prompt = completion.choices[0].message.content or ""
        masked_words, masked_instruction = self.parse(masked_prompt)
        if masked_instruction:
            artprompt = self.artprompt_gen(masked_words, masked_instruction)
        else:
            # Unparseable reply: keep the original prompt rather than
            # replacing the user turn with an empty string.
            artprompt = prompt
        print(artprompt)
        return artprompt

    def attack(
            self,
            messages: List[Dict[str, str]],
            **kwargs
    ) -> List[Dict[str, str]]:
        """
        Execute an attack by transforming the latest user prompt into an ArtPrompt.

        The last message is modified in place and the same list is returned.

        :param messages: List of messages in the conversation.
        :param kwargs: Additional parameters for the attack.
        :return: Modified list of messages with the rewritten prompt.
        """
        assert is_user_turn(messages)
        art_prompt = self.ascii_art(messages[-1]["content"])
        messages[-1]["content"] = art_prompt

        return messages