Source code for panda_guard.role.defenses.smoothllm

# encoding: utf-8
# Author    : Floyed<Floyed_Shen@outlook.com>
# Datetime  : 2024/9/2 20:13
# User      : yu
# Product   : PyCharm
# Project   : panda-guard
# File      : smoothllm.py
# explain   :

from typing import Dict, List
import string
import random
from copy import deepcopy
from dataclasses import dataclass, field
from panda_guard.role.defenses import BaseDefender, BaseDefenderConfig

from panda_guard.utils import is_user_turn


[docs]@dataclass
class SmoothLLMDefenderConfig(BaseDefenderConfig):
    """
    Configuration for SmoothLLMDefender.

    :param defender_cls: Class of the defender, default is "SmoothLLMDefender".
    :param perturbation_type: Type of perturbation to apply, default is "swap".
    :param perturbation_ratio: Ratio of the prompt to perturb, default is 0.1.
    :param num_perturbations: Number of perturbed prompts to generate, default is 3.
    :param batch_inference: Boolean flag indicating whether batch inference should be used, default is True.
    """
    defender_cls: str = field(default="SmoothLLMDefender")
    perturbation_type: str = field(default="swap")
    perturbation_ratio: float = field(default=0.1)
    num_perturbations: int = field(default=3)
    batch_inference: bool = field(default=True)  # New option for batch inference



[docs]class SmoothLLMDefender(BaseDefender):
    """
    SmoothLLMDefender applies perturbations to defend against jailbreak attacks.

    Based on "Smoothllm: Defending large language models against jailbreaking attacks" by Robey et al. (2023).
    Paper link: https://arxiv.org/abs/2310.03684

    :param config: Configuration for SmoothLLMDefender.
    """

    def __init__(self, config: SmoothLLMDefenderConfig):
        super().__init__(config)
        self.perturbation_type = config.perturbation_type
        self.perturbation_ratio = config.perturbation_ratio
        self.num_perturbations = config.num_perturbations
        self.batch_inference = config.batch_inference  # Store the batch_inference option
        self.alphabet = string.printable

[docs]    def defense(
            self,
            messages: List[Dict[str, str]] = None,
    ) -> List[Dict[str, str]]:
        """
        Apply SmoothLLM defense by generating multiple perturbed versions of the user's message and analyzing the responses.

        :param messages: List of messages to defend against jailbreak attacks.
        :return: List of messages after applying SmoothLLM defense.
        """
        assert is_user_turn(messages), "It must be the user's turn to perform defense."

        # Perturb the last message from the user
        perturbed_prompts = [
            self._random_perturb(messages[-1]['content'])
            for _ in range(self.num_perturbations)
        ]
        # If batch inference is enabled, use batch_generate
        if self.batch_inference:
            batch_messages = [deepcopy(messages) for _ in perturbed_prompts]
            for i, prompt in enumerate(perturbed_prompts):
                batch_messages[i][-1]['content'] = prompt

            perturbed_outputs = self.target_llm.batch_generate(
                batch_messages, self.target_llm_gen_config
            )

            # Extract the relevant content from the batch results
            perturbed_outputs = [output[-1]['content'] for output in perturbed_outputs]

        else:
            # Sequentially generate outputs for each perturbed prompt
            perturbed_outputs = []
            for prompt in perturbed_prompts:
                perturbed_messages = messages.copy()
                perturbed_messages[-1]['content'] = prompt

                generated_responses = self.target_llm.generate(
                    perturbed_messages,
                    self.target_llm_gen_config
                )
                response = generated_responses[-1]['content']
                if response is None:
                    response = ""
                perturbed_outputs.append(response)

        # Analyze outputs and determine if the LLM was jailbroken
        are_jailbroken = [self._is_jailbroken(output) for output in perturbed_outputs]

        # Determine the majority decision
        jailbreak_majority = sum(are_jailbroken) > len(are_jailbroken) / 2

        # Filter outputs that match the majority result
        final_outputs = [
            output for output, jb in zip(perturbed_outputs, are_jailbroken)
            if jb == jailbreak_majority
        ]

        # Choose a response that aligns with the majority vote
        chosen_output = random.choice(final_outputs) if final_outputs else messages[-1]['content']

        # Modify the original message to reflect the chosen response
        messages.append({
            "role": "assistant",
            "content": chosen_output
        })

        return messages

    def _random_perturb(self, prompt: str) -> str:
        """
        Applies a random perturbation to the prompt based on the chosen method.

        :param prompt: The prompt to be perturbed.
        :return: The perturbed prompt.
        """
        if self.perturbation_type == "swap":
            return self._swap_perturbation(prompt)
        elif self.perturbation_type == "insert":
            return self._insert_perturbation(prompt)
        elif self.perturbation_type == "patch":
            return self._patch_perturbation(prompt)
        else:
            raise ValueError(f"Unknown perturbation type: {self.perturbation_type}")

    def _swap_perturbation(self, s: str) -> str:
        """
        Apply a swap perturbation to randomly replace characters in the prompt.

        :param s: The prompt to be perturbed.
        :return: The perturbed prompt with swapped characters.
        """
        list_s = list(s)
        sampled_indices = random.sample(range(len(s)), int(len(s) * self.perturbation_ratio))
        for i in sampled_indices:
            list_s[i] = random.choice(self.alphabet)
        return ''.join(list_s)

    def _insert_perturbation(self, s: str) -> str:
        """
        Apply an insert perturbation to randomly insert characters in the prompt.

        :param s: The prompt to be perturbed.
        :return: The perturbed prompt with inserted characters.
        """
        list_s = list(s)
        sampled_indices = random.sample(range(len(s)), int(len(s) * self.perturbation_ratio))
        for i in sampled_indices:
            list_s.insert(i, random.choice(self.alphabet))
        return ''.join(list_s)

    def _patch_perturbation(self, s: str) -> str:
        """
        Apply a patch perturbation to replace a substring of the prompt.

        :param s: The prompt to be perturbed.
        :return: The perturbed prompt with a replaced substring.
        """
        list_s = list(s)
        substring_width = int(len(s) * self.perturbation_ratio)
        max_start = len(s) - substring_width
        start_index = random.randint(0, max_start)
        sampled_chars = ''.join(random.choice(self.alphabet) for _ in range(substring_width))
        list_s[start_index:start_index + substring_width] = sampled_chars
        return ''.join(list_s)

    def _is_jailbroken(self, text: str) -> bool:
        """
        Check if a given prompt results in a jailbreak.

        :param text: The generated response to check.
        :return: True if the response is a jailbreak, False otherwise.
        """
        TEST_PREFIXES = [
            "I'm sorry",
            "Sorry",
            "I am sorry",
            "I apologize",
            "As an",
            "As an AI",
            "I'm an",
            "I'm just",
            "As a language model",
            "As an Assistant",
            "I cannot",
            "I do not",
            "Hello!"
        ]
        return not any(prefix in text for prefix in TEST_PREFIXES)