Source code for panda_guard.role.defenses.self_defense

from typing import Dict, List
from dataclasses import dataclass, field
from panda_guard.role.defenses import BaseDefender, BaseDefenderConfig

from panda_guard.llms import BaseLLMConfig, LLMGenerateConfig
from panda_guard.role.judges import BaseJudgeConfig, create_judge
from panda_guard.role.judges.llm_based import PairLLMJudgeConfig
from panda_guard.utils import load_class, parse_nested_config


[docs]@dataclass
class SelfDefenseDefenderConfig(BaseDefenderConfig):
    """
    Configuration for the Base Defender.

    :param defender_cls: Class of the defender.
    :type defender_cls: str
    :param defender_name: Name of the defender.
    :type defender_name: str
    :param target_llm_config: Configuration for the target language model.
    :type target_llm_config: BaseLLMConfig
    :param target_llm_gen_config: Configuration for generating output with the target LLM.
    :type target_llm_gen_config: LLMGenerateConfig
    :param judge_config: Configuration for the judge, used to assess the defense.
    :type judge_config: BaseJudgeConfig
    :param judge_score_threshold: The threshold score for the judge to decide the defense action.
    :type judge_score_threshold: int
    """

    defender_cls: str = field(default="SelfDefenseDefender")
    defender_name: str = field(default="SelfDefense")
    target_llm_config: BaseLLMConfig = field(default_factory=BaseLLMConfig)
    target_llm_gen_config: LLMGenerateConfig = field(default=None)

    judge_config: BaseJudgeConfig = field(default_factory=PairLLMJudgeConfig)
    judge_score_threshold: int = field(default=5)



[docs]class SelfDefenseDefender(BaseDefender):
    """
    SelfDefenseDefender implements the defense mechanism for the attack,
    generating responses and judging their appropriateness.
    https://arxiv.org/abs/2308.07308
    """

    def __init__(self, config: SelfDefenseDefenderConfig):
        """
        Initialize the defender with the provided configuration.

        :param config: Configuration for the SelfDefenseDefender.
        :type config: SelfDefenseDefenderConfig
        """
        # create self.target_llm, self.target_llm_gen_config
        super().__init__(config)

        judge_config_dict = config.judge_config
        judge_config_class = load_class(judge_config_dict.get("judge_cls"), "judges")
        judge_config = parse_nested_config(judge_config_class, judge_config_dict)
        self.judge = create_judge(config=judge_config)
        self.judge_score_threshold = config.judge_score_threshold

[docs]    def response(self, message: List[Dict[str, str]]):
        """
        Generate a response from the target language model.

        :param message: The input message to be processed by the target language model.
        :type message: List[Dict[str, str]]
        :return: Generated response based on the input message.
        :rtype: List[Dict[str, str]]
        """
        message = self.target_llm.generate(
            messages=message, config=self.target_llm_gen_config
        )
        return message

[docs]    def defense(self, messages=List[Dict[str, str]]):
        """
        Perform defense by generating a response, judging it, and applying defense action.

        :param messages: The input messages that are subject to defense.
        :type messages: List[Dict[str, str]]
        :return: The final defense response after evaluating the defense.
        :rtype: List[Dict[str, str]]
        """

        # without response
        naive_response = self.response(message=messages)

        # judge
        judge_score = self.judge.judge(messages=naive_response)

        if judge_score >= self.judge_score_threshold:
            naive_response[-1]["content"] = "Sorry, I CANNOT assist with that."

        return naive_response