Source code for panda_guard.role.attacks.art_prompt

# encoding: utf-8
# Author    : Shen Sicheng
# Datetime  : 2024/12/02 16:30
# User      : yu
# Product   : PyCharm
# Project   : panda-guard
# File      :art_prompt.py
# explain   :

import json
import os
from typing import Dict, List, Any
from dataclasses import dataclass, field
from openai import OpenAI
from panda_guard.role.attacks import BaseAttacker, BaseAttackerConfig

from panda_guard.llms import create_llm, BaseLLMConfig, LLMGenerateConfig
from panda_guard.utils import is_user_turn


[docs]@dataclass class ArtPromptAttackerConfig(BaseAttackerConfig): """ Configuration for the Rewrite Attacker. :param attacker_cls: Class of the attacker, default is "RewriteAttacker". :param attacker_name: Name of the attacker. :param llm_config: Configuration for the language model. :param llm_gen_config: Configuration for generating output with LLM. :param rewrite_template: Template for LLM to mask offensive words """ attacker_cls: str = field(default="ArtPromptAttacker") attacker_name: str = field(default="ArtPrompt") mask_llm_url: str = field(default=None) llm_gen_config: LLMGenerateConfig = field(default=None) rewrite_template: str = field(default=None)
[docs]class ArtPromptAttacker(BaseAttacker): """ Rewrite Attacker Implementation for modifying user prompts to bypass restrictions. TIPS: mask_llm_url should be set in config file for generate mask words and mask instruction. Here we use GPT3.5-turbo as the mask LLM. Correspondingly, OPENAI_API_KEY should be set in environment variables. """ def __init__(self, config: ArtPromptAttackerConfig): super().__init__(config) self.mask_llm_url = config.mask_llm_url self.llm_gen_config = config.llm_gen_config self.rewrite_template = config.rewrite_template
[docs] def parse(self, response): """ To parse the response from LLM to get masked words and masked instruction. :param response: The response from LLM to [MASK] sensitive word. :return : masked_words: list, masked words :return : masked_instruction: str, masked instruction """ masked_words = [] # may be multiple musked words if "### Masked words:" in response: masked_words = response.split("### Masked words:")[1].split("###")[0].strip() idx1 = masked_words.find("[") idx2 = masked_words.find("]") masked_words = masked_words[idx1 + 1:idx2].split(",") masked_words = [word.strip() for word in masked_words] masked_instruction = "" if "### Masked instruction:" in response: masked_instruction = response.split("### Masked instruction:")[1].strip() return masked_words, masked_instruction
[docs] def ASCII_sub(self, word, font_seed=0): """ To transfer masked words to ASCII art. :param word: The word to be converted to ASCII art. :param font_seed: int(0-233) The font of the ASCII art. Default is '1943'. :return: string ASCII art, """ with open('/home/shensicheng/code/jailbreak/data/ArtPrompt/vitc-s.json', 'r') as f: ascii_data = json.load(f) try: assert 0 <= font_seed < 234, "font_seed should be an integer in range 0-233" except AssertionError as e: print(f"Assertion failed: {e}") font_seed = max(0, min(font_seed, 233)) ascii_art_dict = { item["text"]: item["art_text"] for item in ascii_data if font_seed * 36 <= item["id"] < (font_seed + 1) * 36 } font_lines = ascii_art_dict["A"].count('\n') art_lines = [""] * font_lines for char in word.upper(): if char in ascii_art_dict: art = ascii_art_dict[char].split("\n") for i in range(font_lines): art_lines[i] += art[i] + " " return "\n"+ "\n".join(art_lines) + "\n" # ASCII art
[docs] def artprompt_gen(self, masked_words, masked_instruction): """ To generate ArtPrompt :param masked word: list masked words in ascii art :param masked_instructioin: str masked instruction with [MASK%] :return: str masked instruction with ascii art """ ascii_words = [self.ASCII_sub(word) for word in masked_words] if len(ascii_words) == 1: masked_instruction = masked_instruction.replace(f"[MASK]", ascii_words[0]) else: for i, ascii_art in enumerate(ascii_words): masked_instruction = masked_instruction.replace(f"[MASK{i + 1}]", ascii_art) return masked_instruction
[docs] def ascii_art(self, prompt: str) -> str: # TODO @shensicheng: Use panda_guard.llms.create_llm to Create a LLM api_key = os.getenv("OPENAI_API_KEY") client = OpenAI( api_key = api_key, base_url = self.mask_llm_url ) system_message = { "role": "system", "content": self.rewrite_template } user_message = { "role": "user", "content": prompt } completion = client.chat.completions.create( model="gpt-3.5-turbo", messages=[system_message, user_message] ) masked_prompt = completion.choices[0].message.content masked_words, masked_instruction = self.parse(masked_prompt) artprompt = self.artprompt_gen(masked_words, masked_instruction) print(artprompt) return artprompt
[docs] def attack( self, messages: List[Dict[str, str]], **kwargs ) -> List[Dict[str, str]]: """ Execute an attack by transfer the latest user prompt to ArtPrompt. :param messages: List of messages in the conversation. :param kwargs: Additional parameters for the attack. :return: Modified list of messages with the rewritten prompt. """ assert is_user_turn(messages) art_prompt = self.ascii_art(messages[-1]["content"]) messages[-1]["content"] = art_prompt return messages