Source code for panda_guard.role.attacks.gptfuzzer_attack.gptfuzz

from typing import Dict, List
from dataclasses import dataclass, field
from panda_guard.role.attacks import BaseAttacker, BaseAttackerConfig


from panda_guard.llms import (
    create_llm,
    BaseLLMConfig,
    LLMGenerateConfig,
)

import pandas as pd

from panda_guard.role.attacks.gptfuzzer_attack.fuzzer.selection import MCTSExploreSelectPolicy
from panda_guard.role.attacks.gptfuzzer_attack.fuzzer.mutator import (
    MutateRandomSinglePolicy,
    OpenAIMutatorCrossOver,
    OpenAIMutatorExpand,
    OpenAIMutatorGenerateSimilar,
    OpenAIMutatorRephrase,
    OpenAIMutatorShorten,
)

from panda_guard.role.attacks.gptfuzzer_attack.fuzzer.core import GPTFuzzer
from panda_guard.role.attacks.gptfuzzer_attack.utils.predict import RoBERTaPredictor

@dataclass
class GPTFuzzAttackerConfig(BaseAttackerConfig):
    """
    Settings consumed by the GPTFuzz attacker.

    :param attacker_cls: Identifier of the attacker class; defaults to "GPTFuzzAttacker".
    :param attacker_name: Name of the attacker; defaults to None.
    :param attacker_llm_config: Configuration of the attacker LLM.
    :param attacker_llm_gen_config: Generation configuration for the attacker LLM.
    :param target_llm_config: Configuration of the target LLM.
    :param target_llm_gen_config: Generation configuration for the target LLM.
    :param initial_seed: Initial seed templates for fuzzing.
    :param predict_model: Model used to judge whether a jailbreak attempt succeeded
        (documented as producing 0 or 1).
    """

    attacker_cls: str = "GPTFuzzAttacker"
    attacker_name: str = None

    # LLM configs are built per instance through a factory so instances never
    # share one default config object.
    attacker_llm_config: BaseLLMConfig = field(default_factory=BaseLLMConfig)
    attacker_llm_gen_config: LLMGenerateConfig = None
    target_llm_config: BaseLLMConfig = field(default_factory=BaseLLMConfig)
    target_llm_gen_config: LLMGenerateConfig = None

    # NOTE(review): annotated as ``list``, yet GPTFuzzAttacker.__init__ hands this
    # value to ``pandas.read_csv`` and reads its "text" column -- it looks like a
    # CSV path is what is actually expected; confirm before tightening the type.
    initial_seed: list = None
    predict_model: str = None
class GPTFuzzAttacker(BaseAttacker):
    """
    GPTFuzz attacker: drives a ``GPTFuzzer`` run against the target LLM.

    The attacker LLM is handed to a set of mutation operators, the target LLM
    is what the fuzzer queries, and a ``RoBERTaPredictor`` is supplied as the
    fuzzer's predictor.

    Reference:
        Yu J, Lin X, Yu Z, et al. Gptfuzzer: Red teaming large language models
        with auto-generated jailbreak prompts[J]. arXiv preprint
        arXiv:2309.10253, 2023.

    :param config: Configuration for the GPTFuzzAttacker.
    """

    def __init__(self, config: GPTFuzzAttackerConfig):
        """
        Create both LLMs, load the seed templates and build the predictor.

        :param config: Attacker configuration. ``config.initial_seed`` is read
            with ``pandas.read_csv`` and must provide a "text" column;
            ``config.predict_model`` is passed to ``RoBERTaPredictor``.
        """
        super().__init__(config)
        self.config = config

        # One LLM proposes mutations, the other is the model under attack.
        self.attack_llm = create_llm(config=config.attacker_llm_config)
        self.target_llm = create_llm(config=config.target_llm_config)

        # Seed templates come from the "text" column of the configured CSV.
        seed_frame = pd.read_csv(config.initial_seed)
        self.initial_seed = seed_frame["text"].tolist()

        self.predict_model = RoBERTaPredictor(config.predict_model)

    def _build_mutate_policy(self) -> MutateRandomSinglePolicy:
        """
        Assemble the mutation policy over the five LLM-backed mutators.

        Every mutator receives the attacker LLM and its generation config.

        :return: A ``MutateRandomSinglePolicy`` wrapping the mutators.
        """
        mutator_classes = (
            OpenAIMutatorCrossOver,
            OpenAIMutatorExpand,
            OpenAIMutatorGenerateSimilar,
            OpenAIMutatorRephrase,
            OpenAIMutatorShorten,
        )
        mutators = [
            mutator_cls(self.attack_llm, self.config.attacker_llm_gen_config)
            for mutator_cls in mutator_classes
        ]
        # NOTE(review): the keyword is spelled ``concatentate`` on purpose; check
        # MutateRandomSinglePolicy's signature before "correcting" it.
        return MutateRandomSinglePolicy(mutators, concatentate=True)

    def attack(self, messages: List[Dict[str, str]], **kwargs) -> List[Dict[str, str]]:
        """
        Run one GPTFuzzer search for the request found in the conversation.

        The content of the first message is used as the fuzzing question. The
        search runs with energy 1, stops after at most 1 jailbreak or 200
        queries, and does not generate in batch.

        :param messages: List of messages in the conversation; the first
            entry's "content" is taken as the question.
        :param kwargs: Accepted for interface compatibility; not read here.
        :return: Whatever ``GPTFuzzer.run`` returns for the target generation
            config -- presumably the attack prompt as role/content messages
            (TODO confirm against ``GPTFuzzer.run``).
        """
        seed_question = messages[0]["content"]
        mutate_policy = self._build_mutate_policy()

        fuzzer = GPTFuzzer(
            question=seed_question,
            target=self.target_llm,
            predictor=self.predict_model,
            initial_seed=self.initial_seed,
            mutate_policy=mutate_policy,
            select_policy=MCTSExploreSelectPolicy(),
            energy=1,
            max_jailbreak=1,
            max_query=200,
            generate_in_batch=False,
        )
        return fuzzer.run(self.config.target_llm_gen_config)