Source code for panda_guard.role.attacks.gptfuzzer_attack.fuzzer.selection

import random
import numpy as np

from panda_guard.role.attacks.gptfuzzer_attack.fuzzer.core import GPTFuzzer, PromptNode


[docs]class SelectPolicy: """ Abstract base class for different selection policies used in GPT fuzzing. :param fuzzer: The `GPTFuzzer` instance responsible for managing fuzzing and prompt nodes. """ def __init__(self, fuzzer: GPTFuzzer): self.fuzzer = fuzzer
[docs] def select(self) -> PromptNode: """ Selects a `PromptNode` based on a specific selection policy. This method must be implemented by subclasses. :return: A selected `PromptNode`. """ raise NotImplementedError( "SelectPolicy must implement select method.")
[docs] def update(self, prompt_nodes: 'list[PromptNode]'): """ Updates the selection policy based on the results of the selected prompt nodes. This method can be overridden by subclasses. :param prompt_nodes: A list of `PromptNode` objects to update the policy with. """ pass
[docs]class RoundRobinSelectPolicy(SelectPolicy): """ A round-robin selection policy where each prompt node is selected in a cyclic manner. :param fuzzer: The `GPTFuzzer` instance responsible for managing fuzzing and prompt nodes. """ def __init__(self, fuzzer: GPTFuzzer = None): super().__init__(fuzzer) self.index: int = 0 # Index for selecting prompt nodes in a round-robin fashion
[docs] def select(self) -> PromptNode: """ Selects a `PromptNode` in a round-robin manner, ensuring each node is selected once before looping back. :return: A `PromptNode` selected in round-robin fashion. """ seed = self.fuzzer.prompt_nodes[self.index] seed.visited_num += 1 return seed
[docs] def update(self, prompt_nodes: 'list[PromptNode]'): """ Updates the round-robin index to ensure the next node is selected. :param prompt_nodes: A list of `PromptNode` objects, which is used for updating the selection index. """ self.index = (self.index - 1 + len(self.fuzzer.prompt_nodes) ) % len(self.fuzzer.prompt_nodes)
[docs]class RandomSelectPolicy(SelectPolicy): """ A random selection policy that selects a `PromptNode` at random. :param fuzzer: The `GPTFuzzer` instance responsible for managing fuzzing and prompt nodes. """ def __init__(self, fuzzer: GPTFuzzer = None): super().__init__(fuzzer)
[docs] def select(self) -> PromptNode: """ Selects a `PromptNode` randomly from the available prompt nodes. :return: A randomly selected `PromptNode`. """ seed = random.choice(self.fuzzer.prompt_nodes) seed.visited_num += 1 return seed
[docs]class UCBSelectPolicy(SelectPolicy): """ Upper Confidence Bound (UCB) selection policy, which balances exploration and exploitation using UCB. :param explore_coeff: The coefficient that controls the exploration factor. :param fuzzer: The `GPTFuzzer` instance responsible for managing fuzzing and prompt nodes. """ def __init__(self, explore_coeff: float = 1.0, fuzzer: GPTFuzzer = None): super().__init__(fuzzer) self.step = 0 self.last_choice_index = None self.explore_coeff = explore_coeff self.rewards = [0 for _ in range(len(self.fuzzer.prompt_nodes))]
[docs] def select(self) -> PromptNode: """ Selects a `PromptNode` using the UCB algorithm, which balances exploration and exploitation. :return: A `PromptNode` selected based on the UCB algorithm. """ if len(self.fuzzer.prompt_nodes) > len(self.rewards): self.rewards.extend( [0 for _ in range(len(self.fuzzer.prompt_nodes) - len(self.rewards))]) self.step += 1 scores = np.zeros(len(self.fuzzer.prompt_nodes)) for i, prompt_node in enumerate(self.fuzzer.prompt_nodes): smooth_visited_num = prompt_node.visited_num + 1 scores[i] = self.rewards[i] / smooth_visited_num + \ self.explore_coeff * \ np.sqrt(2 * np.log(self.step) / smooth_visited_num) self.last_choice_index = np.argmax(scores) self.fuzzer.prompt_nodes[self.last_choice_index].visited_num += 1 return self.fuzzer.prompt_nodes[self.last_choice_index]
[docs] def update(self, prompt_nodes: 'list[PromptNode]'): """ Updates the reward for the last selected prompt node based on the number of jailbreaks. :param prompt_nodes: A list of `PromptNode` objects, used to calculate the rewards for the last selected node. """ succ_num = sum([prompt_node.num_jailbreak for prompt_node in prompt_nodes]) self.rewards[self.last_choice_index] += \ succ_num / len(self.fuzzer.questions)
[docs]class MCTSExploreSelectPolicy(SelectPolicy): """ A selection policy based on Monte Carlo Tree Search (MCTS) to explore and exploit nodes. :param fuzzer: The `GPTFuzzer` instance responsible for managing fuzzing and prompt nodes. :param ratio: Balance between exploration and exploitation in MCTS. :param alpha: Penalty for selecting nodes at deeper levels. :param beta: Minimum reward after applying the penalty. """ def __init__(self, fuzzer: GPTFuzzer = None, ratio=0.5, alpha=0.1, beta=0.2): super().__init__(fuzzer) self.step = 0 self.mctc_select_path: 'list[PromptNode]' = [] self.last_choice_index = None self.rewards = [] self.ratio = ratio self.alpha = alpha self.beta = beta
[docs] def select(self) -> PromptNode: """ Selects a `PromptNode` based on MCTS, balancing exploration and exploitation. :return: A `PromptNode` selected using the MCTS algorithm. """ self.step += 1 if len(self.fuzzer.prompt_nodes) > len(self.rewards): self.rewards.extend( [0 for _ in range(len(self.fuzzer.prompt_nodes) - len(self.rewards))]) self.mctc_select_path.clear() cur = max( self.fuzzer.initial_prompts_nodes, key=lambda pn: self.rewards[pn.index] / (pn.visited_num + 1) + self.ratio * np.sqrt(2 * np.log(self.step) / (pn.visited_num + 0.01)) ) self.mctc_select_path.append(cur) while len(cur.child) > 0: if np.random.rand() < self.alpha: break cur = max( cur.child, key=lambda pn: self.rewards[pn.index] / (pn.visited_num + 1) + self.ratio * np.sqrt(2 * np.log(self.step) / (pn.visited_num + 0.01)) ) self.mctc_select_path.append(cur) for pn in self.mctc_select_path: pn.visited_num += 1 self.last_choice_index = cur.index return cur
[docs] def update(self, prompt_nodes: 'list[PromptNode]'): """ Updates the rewards for the nodes in the MCTS path based on the success of the attack. :param prompt_nodes: A list of `PromptNode` objects, used to calculate the reward for the selected path. """ succ_num = sum([prompt_node.num_jailbreak for prompt_node in prompt_nodes]) last_choice_node = self.fuzzer.prompt_nodes[self.last_choice_index] for prompt_node in reversed(self.mctc_select_path): reward = succ_num / len(prompt_nodes) self.rewards[prompt_node.index] += reward * \ max(self.beta, (1 - 0.1 * last_choice_node.level))
[docs]class EXP3SelectPolicy(SelectPolicy): """ The EXP3 (Exponential Weights) selection policy, balancing exploration and exploitation using probability distribution. :param gamma: Exploration coefficient that controls the randomness. :param alpha: Learning rate for updating the weights. :param fuzzer: The `GPTFuzzer` instance responsible for managing fuzzing and prompt nodes. """ def __init__(self, gamma: float = 0.05, alpha: float = 25, fuzzer: GPTFuzzer = None): super().__init__(fuzzer) self.energy = self.fuzzer.energy self.gamma = gamma self.alpha = alpha self.last_choice_index = None self.weights = [1. for _ in range(len(self.fuzzer.prompt_nodes))] self.probs = [0. for _ in range(len(self.fuzzer.prompt_nodes))]
[docs] def select(self) -> PromptNode: """ Selects a `PromptNode` based on the EXP3 algorithm, which computes selection probabilities using weighted exploration. :return: A `PromptNode` selected based on the EXP3 algorithm. """ if len(self.fuzzer.prompt_nodes) > len(self.weights): self.weights.extend( [1. for _ in range(len(self.fuzzer.prompt_nodes) - len(self.weights))]) np_weights = np.array(self.weights) probs = (1 - self.gamma) * np_weights / np_weights.sum() + \ self.gamma / len(self.fuzzer.prompt_nodes) self.last_choice_index = np.random.choice( len(self.fuzzer.prompt_nodes), p=probs) self.fuzzer.prompt_nodes[self.last_choice_index].visited_num += 1 self.probs[self.last_choice_index] = probs[self.last_choice_index] return self.fuzzer.prompt_nodes[self.last_choice_index]
[docs] def update(self, prompt_nodes: 'list[PromptNode]'): """ Updates the weights and probabilities for the last selected node based on the success of the attack. :param prompt_nodes: A list of `PromptNode` objects, used to update the weights for the EXP3 algorithm. """ succ_num = sum([prompt_node.num_jailbreak for prompt_node in prompt_nodes]) r = 1 - succ_num / len(prompt_nodes) x = -1 * r / self.probs[self.last_choice_index] self.weights[self.last_choice_index] *= np.exp( self.alpha * x / len(self.fuzzer.prompt_nodes))