From 83a778e845d5df683a88e08da379a83de3890bfb Mon Sep 17 00:00:00 2001 From: hannahwestra25 Date: Fri, 15 May 2026 17:35:23 -0400 Subject: [PATCH 01/12] init commit --- pyrit/scenario/__init__.py | 4 + pyrit/scenario/scenarios/adaptive/__init__.py | 26 ++ .../scenario/scenarios/adaptive/dispatcher.py | 162 ++++++++++ pyrit/scenario/scenarios/adaptive/selector.py | 187 ++++++++++++ .../scenarios/adaptive/text_adaptive.py | 275 +++++++++++++++++ .../scenarios/adaptive/test_dispatcher.py | 225 ++++++++++++++ .../scenarios/adaptive/test_selector.py | 188 ++++++++++++ .../scenarios/adaptive/test_text_adaptive.py | 282 ++++++++++++++++++ 8 files changed, 1349 insertions(+) create mode 100644 pyrit/scenario/scenarios/adaptive/__init__.py create mode 100644 pyrit/scenario/scenarios/adaptive/dispatcher.py create mode 100644 pyrit/scenario/scenarios/adaptive/selector.py create mode 100644 pyrit/scenario/scenarios/adaptive/text_adaptive.py create mode 100644 tests/unit/scenario/scenarios/adaptive/test_dispatcher.py create mode 100644 tests/unit/scenario/scenarios/adaptive/test_selector.py create mode 100644 tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py diff --git a/pyrit/scenario/__init__.py b/pyrit/scenario/__init__.py index b66539543..3aac6ea7f 100644 --- a/pyrit/scenario/__init__.py +++ b/pyrit/scenario/__init__.py @@ -31,17 +31,20 @@ # Import scenario submodules directly and register them as virtual subpackages # This allows: from pyrit.scenario.airt import ContentHarms # without needing separate pyrit/scenario/airt/ directories +from pyrit.scenario.scenarios import adaptive as _adaptive_module from pyrit.scenario.scenarios import airt as _airt_module from pyrit.scenario.scenarios import benchmark as _benchmark_module from pyrit.scenario.scenarios import foundry as _foundry_module from pyrit.scenario.scenarios import garak as _garak_module +sys.modules["pyrit.scenario.adaptive"] = _adaptive_module sys.modules["pyrit.scenario.airt"] = _airt_module 
sys.modules["pyrit.scenario.benchmark"] = _benchmark_module sys.modules["pyrit.scenario.garak"] = _garak_module sys.modules["pyrit.scenario.foundry"] = _foundry_module # Also expose as attributes for IDE support +adaptive = _adaptive_module airt = _airt_module benchmark = _benchmark_module garak = _garak_module @@ -59,6 +62,7 @@ "ScenarioStrategy", "ScenarioIdentifier", "ScenarioResult", + "adaptive", "airt", "benchmark", "garak", diff --git a/pyrit/scenario/scenarios/adaptive/__init__.py b/pyrit/scenario/scenarios/adaptive/__init__.py new file mode 100644 index 000000000..e06e166a6 --- /dev/null +++ b/pyrit/scenario/scenarios/adaptive/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Adaptive scenario classes.""" + +from pyrit.scenario.scenarios.adaptive.dispatcher import ( + BANDIT_CONTEXT_LABEL, + AdaptiveDispatchAttack, +) +from pyrit.scenario.scenarios.adaptive.selector import ( + AdaptiveTechniqueSelector, + ContextExtractor, + global_context, + harm_category_context, +) +from pyrit.scenario.scenarios.adaptive.text_adaptive import TextAdaptive + +__all__ = [ + "AdaptiveDispatchAttack", + "AdaptiveTechniqueSelector", + "BANDIT_CONTEXT_LABEL", + "ContextExtractor", + "TextAdaptive", + "global_context", + "harm_category_context", +] diff --git a/pyrit/scenario/scenarios/adaptive/dispatcher.py b/pyrit/scenario/scenarios/adaptive/dispatcher.py new file mode 100644 index 000000000..ae1087a14 --- /dev/null +++ b/pyrit/scenario/scenarios/adaptive/dispatcher.py @@ -0,0 +1,162 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +""" +``AdaptiveDispatchAttack`` — an ``AttackStrategy`` that picks which inner +technique to run for each objective using an ``AdaptiveTechniqueSelector``. + +This is the execution-side counterpart to the selector. 
The selector decides +*which arm to pull*; the dispatcher *runs the arm*, records the outcome, and +loops up to ``max_attempts_per_objective`` times. + +The dispatcher reads a bandit-context key from +``context.memory_labels[BANDIT_CONTEXT_LABEL]``. The scenario is expected to +stamp that label per-objective (computed once at atomic-attack construction +time via a ``ContextExtractor``). When the label is missing, the global +context is used. +""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any + +from pyrit.executor.attack.core.attack_parameters import AttackParameters +from pyrit.executor.attack.core.attack_strategy import AttackContext, AttackStrategy +from pyrit.models import AttackOutcome, AttackResult +from pyrit.scenario.scenarios.adaptive.selector import ( + GLOBAL_CONTEXT, + AdaptiveTechniqueSelector, +) + +if TYPE_CHECKING: + from pyrit.prompt_target import PromptTarget + +logger = logging.getLogger(__name__) + + +BANDIT_CONTEXT_LABEL: str = "_adaptive_context" +"""Memory-label key whose value is the bandit context string for an objective.""" + +ADAPTIVE_ARM_LABEL: str = "_adaptive_arm" +ADAPTIVE_ATTEMPT_LABEL: str = "_adaptive_attempt" + + +@dataclass +class AdaptiveDispatchContext(AttackContext[AttackParameters]): + """Execution context for ``AdaptiveDispatchAttack``. + + No extra state is needed beyond what ``AttackContext`` provides; the + dispatcher reads the objective and memory labels from the base class. + """ + + +class AdaptiveDispatchAttack(AttackStrategy[AdaptiveDispatchContext, AttackResult]): + """ + Attack that delegates each attempt to one of several inner ``AttackStrategy`` + instances ("arms"), choosing per attempt via an ``AdaptiveTechniqueSelector``. + + For each objective the dispatcher loops up to ``max_attempts_per_objective`` + times. 
On each iteration it asks the selector which arm to try, executes + the inner attack with the objective, records the outcome on the selector, + and stops early on success. + + The selector instance is **shared by reference** with the scenario, so + learning accumulates across all objectives in a run. + """ + + def __init__( + self, + *, + objective_target: PromptTarget, + arms: dict[str, AttackStrategy[Any, AttackResult]], + selector: AdaptiveTechniqueSelector, + max_attempts_per_objective: int = 3, + ) -> None: + """ + Args: + objective_target (PromptTarget): The target the inner attacks run against. + Stored for identifier/logging parity; the dispatcher does not call + the target directly. + arms (dict[str, AttackStrategy[Any, AttackResult]]): Mapping from + technique name to a pre-built inner attack. Must be non-empty. + selector (AdaptiveTechniqueSelector): Shared bandit state. + max_attempts_per_objective (int): Maximum number of arm attempts + per objective. Must be >= 1. Defaults to 3. + + Raises: + ValueError: If ``arms`` is empty or ``max_attempts_per_objective`` < 1. 
+ """ + if not arms: + raise ValueError("arms must contain at least one technique") + if max_attempts_per_objective < 1: + raise ValueError( + f"max_attempts_per_objective must be >= 1, got {max_attempts_per_objective}" + ) + + super().__init__( + objective_target=objective_target, + context_type=AdaptiveDispatchContext, + params_type=AttackParameters, + logger=logger, + ) + self._arms = arms + self._selector = selector + self._max_attempts = max_attempts_per_objective + + def _validate_context(self, *, context: AdaptiveDispatchContext) -> None: + if not context.objective or context.objective.isspace(): + raise ValueError("Attack objective must be provided and non-empty") + + async def _setup_async(self, *, context: AdaptiveDispatchContext) -> None: + pass + + async def _teardown_async(self, *, context: AdaptiveDispatchContext) -> None: + pass + + async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackResult: + bandit_context = context.memory_labels.get(BANDIT_CONTEXT_LABEL, GLOBAL_CONTEXT) + arm_names = list(self._arms.keys()) + + last_result: AttackResult | None = None + trail: list[dict[str, str]] = [] + + for attempt_idx in range(self._max_attempts): + chosen = self._selector.select(context=bandit_context, arms=arm_names) + inner = self._arms[chosen] + attempt_labels = { + **context.memory_labels, + ADAPTIVE_ARM_LABEL: chosen, + ADAPTIVE_ATTEMPT_LABEL: str(attempt_idx + 1), + } + + logger.debug( + "AdaptiveDispatchAttack: attempt %d/%d context=%r arm=%r", + attempt_idx + 1, + self._max_attempts, + bandit_context, + chosen, + ) + + result = await inner.execute_async( + objective=context.objective, + memory_labels=attempt_labels, + ) + success = result.outcome == AttackOutcome.SUCCESS + self._selector.update(context=bandit_context, technique=chosen, success=success) + + trail.append({"technique": chosen, "outcome": result.outcome.value}) + last_result = result + + if success: + break + + # ``max_attempts`` is validated >= 1 above, so the 
loop always runs at least once. + assert last_result is not None + last_result.metadata = { + **last_result.metadata, + "adaptive_attempts": trail, + "adaptive_context": bandit_context, + } + return last_result diff --git a/pyrit/scenario/scenarios/adaptive/selector.py b/pyrit/scenario/scenarios/adaptive/selector.py new file mode 100644 index 000000000..ff2794757 --- /dev/null +++ b/pyrit/scenario/scenarios/adaptive/selector.py @@ -0,0 +1,187 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +""" +Adaptive technique selection for the ``TextAdaptive`` scenario. + +This module provides: + - ``AdaptiveTechniqueSelector``: an epsilon-greedy bandit keyed by + ``(context, technique)`` that tracks successes/attempts per arm and + picks the next technique to try. + - ``ContextExtractor``: a callable alias for deriving a context string + from a ``SeedAttackGroup``, plus two ready-made extractors: + ``global_context`` (single bucket) and ``harm_category_context`` + (first harm category, falling back to ``"_uncategorized"``). + +The selector is intentionally I/O-free and synchronous; it holds a small +mutable table that lives for the duration of a single scenario run. 
+""" + +from __future__ import annotations + +import random +from typing import TYPE_CHECKING, Callable, Sequence + +if TYPE_CHECKING: + from pyrit.models.seeds.seed_attack_group import SeedAttackGroup + + +ContextExtractor = Callable[["SeedAttackGroup"], str] +"""Maps a ``SeedAttackGroup`` to a bandit context key.""" + + +GLOBAL_CONTEXT: str = "_global" +UNCATEGORIZED_CONTEXT: str = "_uncategorized" + + +def global_context(_seed_attack_group: "SeedAttackGroup") -> str: + """Return a constant context so all objectives share one bandit table.""" + return GLOBAL_CONTEXT + + +def harm_category_context(seed_attack_group: "SeedAttackGroup") -> str: + """Return the first harm category on the seed group, or a fallback.""" + categories = seed_attack_group.harm_categories + if not categories: + return UNCATEGORIZED_CONTEXT + return categories[0] + + +class AdaptiveTechniqueSelector: + """ + Epsilon-greedy selector over attack techniques. + + The selector maintains a table of ``(context, technique) -> (successes, attempts)`` + counts. ``select`` returns the next technique to try for a given context, + and ``update`` records the outcome of an attempt. + + Selection uses epsilon-greedy with optimistic initialization: + - With probability ``epsilon``, pick uniformly at random from ``arms``. + - Otherwise, pick the arm with the highest estimated success rate. + The estimate is ``(successes + 1) / (attempts + 1)``, so unseen + arms look like 100% success and are explored first via tiebreak. + + When a ``(context, arm)`` cell has fewer than ``pool_threshold`` attempts, + the estimate falls back to the pooled global rate for that arm across all + contexts. This lets per-context bandits benefit from cross-context data + until they have enough local samples. Set ``pool_threshold=1`` to disable + pooling (use the local estimate as soon as any attempt is recorded). + + Note: + This class is not thread/async safe. 
It assumes sequential calls, + which matches the base ``Scenario._execute_scenario_async`` loop. + """ + + # Tolerance for tiebreaking in exploitation. Estimates are rational today, + # so equality works, but this guards against future estimators that may + # introduce floating-point drift. + _TIE_TOL: float = 1e-12 + + def __init__( + self, + *, + epsilon: float = 0.2, + pool_threshold: int = 3, + rng: random.Random | None = None, + ) -> None: + """ + Args: + epsilon (float): Exploration probability in [0.0, 1.0]. Defaults to 0.2. + pool_threshold (int): Minimum per-(context, arm) attempts before + the local estimate replaces the pooled-global estimate. Must + be >= 1; set to 1 to disable pooling. Defaults to 3. + rng (random.Random | None): Seedable RNG for deterministic tests. + Defaults to a fresh ``random.Random()``. + + Raises: + ValueError: If ``epsilon`` is outside [0.0, 1.0] or + ``pool_threshold`` is < 1. + """ + if not 0.0 <= epsilon <= 1.0: + raise ValueError(f"epsilon must be in [0.0, 1.0], got {epsilon}") + if pool_threshold < 1: + raise ValueError(f"pool_threshold must be >= 1, got {pool_threshold}") + + self._epsilon = epsilon + self._pool_threshold = pool_threshold + self._rng = rng if rng is not None else random.Random() + self._counts: dict[tuple[str, str], tuple[int, int]] = {} + # Per-arm pooled counts, kept in sync with ``_counts`` in ``update`` so + # ``_estimate``'s pooled-backoff branch is O(1). + self._global_counts: dict[str, tuple[int, int]] = {} + + def select(self, *, context: str, arms: Sequence[str]) -> str: + """ + Pick the next arm to try for ``context``. + + Args: + context (str): The context key (e.g. ``"_global"`` or a harm category). + arms (Sequence[str]): The candidate technique names. + + Returns: + str: The chosen arm name. + + Raises: + ValueError: If ``arms`` is empty. 
+ """ + arm_list = list(arms) + if not arm_list: + raise ValueError("arms must contain at least one technique") + + if self._rng.random() < self._epsilon: + return self._rng.choice(arm_list) + + estimates = {arm: self._estimate(context=context, arm=arm) for arm in arm_list} + best = max(estimates.values()) + winners = [arm for arm, value in estimates.items() if value >= best - self._TIE_TOL] + return self._rng.choice(winners) + + def update(self, *, context: str, technique: str, success: bool) -> None: + """ + Record the outcome of an attempt. + + Args: + context (str): The context key the decision was made under. + technique (str): The arm that was tried. + success (bool): Whether the attempt succeeded. + """ + successes, attempts = self._counts.get((context, technique), (0, 0)) + attempts += 1 + if success: + successes += 1 + self._counts[(context, technique)] = (successes, attempts) + + global_successes, global_attempts = self._global_counts.get(technique, (0, 0)) + global_attempts += 1 + if success: + global_successes += 1 + self._global_counts[technique] = (global_successes, global_attempts) + + def success_rate(self, *, context: str, technique: str) -> float: + """ + Return the smoothed success-rate estimate for an arm in a context. + + This is the same value used internally for exploitation decisions. + """ + return self._estimate(context=context, arm=technique) + + def counts(self, *, context: str, technique: str) -> tuple[int, int]: + """Return raw ``(successes, attempts)`` for a ``(context, technique)`` cell.""" + return self._counts.get((context, technique), (0, 0)) + + def snapshot(self) -> dict[tuple[str, str], tuple[int, int]]: + """Return a shallow copy of the full counts table (for logging/debug).""" + return dict(self._counts) + + def _estimate(self, *, context: str, arm: str) -> float: + """ + Smoothed success-rate estimate for ``(context, arm)``. 
+ + Below ``pool_threshold`` local attempts, the estimate uses the + pooled-global success rate for the arm across all contexts. + """ + local_s, local_n = self._counts.get((context, arm), (0, 0)) + if local_n >= self._pool_threshold: + return (local_s + 1) / (local_n + 1) + global_s, global_n = self._global_counts.get(arm, (0, 0)) + return (global_s + 1) / (global_n + 1) diff --git a/pyrit/scenario/scenarios/adaptive/text_adaptive.py b/pyrit/scenario/scenarios/adaptive/text_adaptive.py new file mode 100644 index 000000000..2fa97b706 --- /dev/null +++ b/pyrit/scenario/scenarios/adaptive/text_adaptive.py @@ -0,0 +1,275 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +""" +TextAdaptive scenario — picks attack techniques per-objective using an +epsilon-greedy bandit informed by observed per-run success rates. + +Unlike static scenarios (which run every selected technique against every +objective), TextAdaptive runs **up to** ``max_attempts_per_objective`` +techniques per objective and stops early when one succeeds. Which technique +to try next is decided by an ``AdaptiveTechniqueSelector`` whose Q-values are +updated after every attempt. + +The set of available "arms" comes from the selected scenario strategies, so +``--strategies single_turn`` restricts the bandit to single-turn techniques, +etc. The default selector uses a single global context; pass a different +``context_extractor`` (e.g., ``harm_category_context``) to partition Q-values +per category. 
+""" + +from __future__ import annotations + +import logging +import random +import uuid +from typing import TYPE_CHECKING, ClassVar, cast + +from pyrit.common import apply_defaults +from pyrit.executor.attack import AttackScoringConfig +from pyrit.registry.tag_query import TagQuery +from pyrit.scenario.core.dataset_configuration import DatasetConfiguration +from pyrit.scenario.core.scenario import BaselinePolicy, Scenario +from pyrit.scenario.core.scenario_strategy import ScenarioStrategy +from pyrit.scenario.scenarios.adaptive.dispatcher import ( + BANDIT_CONTEXT_LABEL, + AdaptiveDispatchAttack, +) +from pyrit.scenario.scenarios.adaptive.selector import ( + AdaptiveTechniqueSelector, + ContextExtractor, + global_context, +) + +if TYPE_CHECKING: + from pyrit.executor.attack.core.attack_strategy import AttackStrategy + from pyrit.models import SeedAttackGroup + from pyrit.scenario.core.atomic_attack import AtomicAttack + from pyrit.score import TrueFalseScorer + +logger = logging.getLogger(__name__) + + +def _build_text_adaptive_strategy() -> type[ScenarioStrategy]: + """Build the strategy enum from the core scenario-techniques catalog.""" + from pyrit.registry.object_registries.attack_technique_registry import ( + AttackTechniqueRegistry, + ) + from pyrit.scenario.core.scenario_techniques import SCENARIO_TECHNIQUES + + return AttackTechniqueRegistry.build_strategy_class_from_specs( # type: ignore[return-value, ty:invalid-return-type] + class_name="TextAdaptiveStrategy", + specs=SCENARIO_TECHNIQUES, + aggregate_tags={ + "default": TagQuery.any_of("default"), + "single_turn": TagQuery.any_of("single_turn"), + "multi_turn": TagQuery.any_of("multi_turn"), + }, + ) + + +class TextAdaptive(Scenario): + """ + Adaptive text-attack scenario that selects techniques per-objective using + an epsilon-greedy bandit over the set of selected strategies. + + The bandit: + - Picks an arm uniformly at random with probability ``epsilon``. 
+ - Otherwise exploits the highest observed success rate. Unseen arms + have an optimistic prior so the first few objectives effectively + round-robin through every available technique. + - Pools across contexts when a context has fewer than + ``pool_threshold`` observations for an arm. + + A baseline ``PromptSendingAttack`` is **not** prepended — every objective + runs through the dispatcher, and ``prompt_sending`` participates as one of + the bandit's arms. + """ + + VERSION: int = 1 + BASELINE_POLICY: ClassVar[BaselinePolicy] = BaselinePolicy.Forbidden + _cached_strategy_class: ClassVar[type[ScenarioStrategy] | None] = None + + # ------------------------------------------------------------------ # + # Required class-method overrides # + # ------------------------------------------------------------------ # + + @classmethod + def get_strategy_class(cls) -> type[ScenarioStrategy]: + if cls._cached_strategy_class is None: + cls._cached_strategy_class = _build_text_adaptive_strategy() + return cls._cached_strategy_class + + @classmethod + def get_default_strategy(cls) -> ScenarioStrategy: + strategy_class = cls.get_strategy_class() + return strategy_class("default") + + @classmethod + def required_datasets(cls) -> list[str]: + return [ + "airt_hate", + "airt_fairness", + "airt_violence", + "airt_sexual", + "airt_harassment", + "airt_misinformation", + "airt_leakage", + ] + + @classmethod + def default_dataset_config(cls) -> DatasetConfiguration: + return DatasetConfiguration(dataset_names=cls.required_datasets(), max_dataset_size=4) + + # ------------------------------------------------------------------ # + # Constructor # + # ------------------------------------------------------------------ # + + @apply_defaults + def __init__( + self, + *, + objective_scorer: TrueFalseScorer | None = None, + epsilon: float = 0.2, + pool_threshold: int = 3, + max_attempts_per_objective: int = 3, + seed: int | None = None, + context_extractor: ContextExtractor = global_context, + 
scenario_result_id: str | None = None, + ) -> None: + """ + Args: + objective_scorer (TrueFalseScorer | None): Scorer used to judge each + response. Defaults to the composite scorer built from the base class. + epsilon (float): Exploration probability for the bandit. Defaults to 0.2. + pool_threshold (int): Minimum per-(context, arm) attempts before the + local estimate overrides the pooled-global estimate. Set to 1 to + disable pooling. Defaults to 3. + max_attempts_per_objective (int): Maximum techniques tried per + objective before giving up. Defaults to 3. + seed (int | None): RNG seed for deterministic bandit decisions. + Defaults to ``None`` (non-deterministic). + context_extractor (ContextExtractor): Function mapping a + ``SeedAttackGroup`` to a bandit context key. Defaults to + ``global_context`` (one shared bandit table). Use + ``harm_category_context`` to partition Q-values by harm category. + scenario_result_id (str | None): ID of an existing ``ScenarioResult`` + to resume. + """ + if not objective_scorer: + objective_scorer = self._get_default_objective_scorer() + + self._epsilon = epsilon + self._pool_threshold = pool_threshold + self._max_attempts_per_objective = max_attempts_per_objective + self._seed = seed + self._context_extractor = context_extractor + + super().__init__( + version=self.VERSION, + strategy_class=self.get_strategy_class(), + objective_scorer=objective_scorer, + scenario_result_id=scenario_result_id, + ) + + # ------------------------------------------------------------------ # + # Override atomic-attack construction # + # ------------------------------------------------------------------ # + + async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: + """ + Build one ``AtomicAttack`` per objective, all sharing a single + ``AdaptiveDispatchAttack`` (and therefore a single + ``AdaptiveTechniqueSelector``). 
+ + This is the bandit's "single working memory shared across objectives" + plumbing: each per-objective ``AtomicAttack`` consults and updates the + same selector via the same dispatcher instance. + """ + if self._objective_target is None: + raise ValueError( + "Scenario not properly initialized. Call await scenario.initialize_async() before running." + ) + + from pyrit.scenario.core.atomic_attack import AtomicAttack + + selected_arms = sorted({s.value for s in self._scenario_strategies}) + factories = self._get_attack_technique_factories() + + # Build each arm's inner attack once and reuse across all objectives. + scoring_config = AttackScoringConfig(objective_scorer=cast("TrueFalseScorer", self._objective_scorer)) + arms: dict[str, AttackStrategy] = {} + for technique_name in selected_arms: + factory = factories.get(technique_name) + if factory is None: + logger.warning(f"No factory for technique '{technique_name}', skipping.") + continue + technique = factory.create( + objective_target=self._objective_target, + attack_scoring_config=scoring_config, + ) + arms[technique_name] = technique.attack + + if not arms: + raise ValueError( + "TextAdaptive: no usable techniques after resolving strategies. " + "Check the --strategies selection." + ) + + selector = AdaptiveTechniqueSelector( + epsilon=self._epsilon, + pool_threshold=self._pool_threshold, + rng=random.Random(self._seed), + ) + dispatcher = AdaptiveDispatchAttack( + objective_target=self._objective_target, + arms=arms, + selector=selector, + max_attempts_per_objective=self._max_attempts_per_objective, + ) + # Stash for tests / debugging; not part of the public API. 
+ self._selector = selector + self._dispatcher = dispatcher + + seed_groups_by_dataset = self._dataset_config.get_seed_attack_groups() + atomic_attacks: list[AtomicAttack] = [] + for dataset_name, seed_groups in seed_groups_by_dataset.items(): + for seed_group in seed_groups: + atomic_attacks.append( + self._build_atomic_for_seed_group( + dataset_name=dataset_name, + seed_group=seed_group, + dispatcher=dispatcher, + ) + ) + + return atomic_attacks + + def _build_atomic_for_seed_group( + self, + *, + dataset_name: str, + seed_group: SeedAttackGroup, + dispatcher: AdaptiveDispatchAttack, + ) -> AtomicAttack: + from pyrit.scenario.core.atomic_attack import AtomicAttack + from pyrit.scenario.core.attack_technique import AttackTechnique + + bandit_context = self._context_extractor(seed_group) + # Use the objective's id when available so resume keys are stable across + # runs that re-fetch the same seed groups; fall back to a random uuid. + objective_id = seed_group.objective.id if seed_group.objective.id else uuid.uuid4() + atomic_attack_name = f"adaptive_{dataset_name}_{objective_id}" + + memory_labels = { + **self._memory_labels, + BANDIT_CONTEXT_LABEL: bandit_context, + } + return AtomicAttack( + atomic_attack_name=atomic_attack_name, + attack_technique=AttackTechnique(attack=dispatcher), + seed_groups=[seed_group], + objective_scorer=cast("TrueFalseScorer", self._objective_scorer), + memory_labels=memory_labels, + display_group=dataset_name, + ) diff --git a/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py b/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py new file mode 100644 index 000000000..68051f3d5 --- /dev/null +++ b/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py @@ -0,0 +1,225 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import random +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from pyrit.executor.attack.core.attack_parameters import AttackParameters +from pyrit.models import AttackOutcome, AttackResult +from pyrit.scenario.scenarios.adaptive.dispatcher import ( + ADAPTIVE_ARM_LABEL, + ADAPTIVE_ATTEMPT_LABEL, + BANDIT_CONTEXT_LABEL, + AdaptiveDispatchAttack, + AdaptiveDispatchContext, +) +from pyrit.scenario.scenarios.adaptive.selector import ( + GLOBAL_CONTEXT, + AdaptiveTechniqueSelector, +) + + +def _make_inner_attack(*, name: str, outcomes: list[AttackOutcome]) -> MagicMock: + """Build a mocked inner attack whose execute_async returns the given outcomes in order.""" + inner = MagicMock(name=name) + results = [ + AttackResult( + conversation_id=f"conv-{name}-{i}", + objective="obj", + outcome=outcome, + ) + for i, outcome in enumerate(outcomes) + ] + inner.execute_async = AsyncMock(side_effect=results) + return inner + + +def _make_context(*, objective: str = "obj", labels: dict[str, str] | None = None) -> AdaptiveDispatchContext: + return AdaptiveDispatchContext(params=AttackParameters(objective=objective, memory_labels=labels or {})) + + +@pytest.fixture +def selector() -> AdaptiveTechniqueSelector: + # epsilon=0 makes selection deterministic given the table. 
+ return AdaptiveTechniqueSelector(epsilon=0.0, pool_threshold=1, rng=random.Random(0)) + + +@pytest.fixture +def target() -> MagicMock: + return MagicMock(name="objective_target") + + +class TestInit: + @pytest.mark.usefixtures("patch_central_database") + def test_init_rejects_empty_arms(self, target, selector): + with pytest.raises(ValueError, match="arms"): + AdaptiveDispatchAttack(objective_target=target, arms={}, selector=selector) + + @pytest.mark.parametrize("bad_max", [0, -1]) + @pytest.mark.usefixtures("patch_central_database") + def test_init_rejects_invalid_max_attempts(self, target, selector, bad_max): + with pytest.raises(ValueError, match="max_attempts_per_objective"): + AdaptiveDispatchAttack( + objective_target=target, + arms={"a": _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS])}, + selector=selector, + max_attempts_per_objective=bad_max, + ) + + +@pytest.mark.usefixtures("patch_central_database") +class TestPerform: + async def test_stops_on_first_success(self, target, selector): + a = _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS]) + b = _make_inner_attack(name="b", outcomes=[AttackOutcome.SUCCESS]) + dispatcher = AdaptiveDispatchAttack( + objective_target=target, + arms={"a": a, "b": b}, + selector=selector, + max_attempts_per_objective=5, + ) + + result = await dispatcher._perform_async(context=_make_context()) + + assert result.outcome == AttackOutcome.SUCCESS + total_calls = a.execute_async.call_count + b.execute_async.call_count + assert total_calls == 1 + + async def test_retries_until_max_attempts_on_failure(self, target, selector): + a = _make_inner_attack(name="a", outcomes=[AttackOutcome.FAILURE] * 3) + b = _make_inner_attack(name="b", outcomes=[AttackOutcome.FAILURE] * 3) + dispatcher = AdaptiveDispatchAttack( + objective_target=target, + arms={"a": a, "b": b}, + selector=selector, + max_attempts_per_objective=3, + ) + + result = await dispatcher._perform_async(context=_make_context()) + + assert 
result.outcome == AttackOutcome.FAILURE + total_calls = a.execute_async.call_count + b.execute_async.call_count + assert total_calls == 3 + + async def test_updates_selector_on_each_attempt(self, target, selector): + a = _make_inner_attack(name="a", outcomes=[AttackOutcome.FAILURE, AttackOutcome.SUCCESS]) + b = _make_inner_attack(name="b", outcomes=[AttackOutcome.SUCCESS]) + dispatcher = AdaptiveDispatchAttack( + objective_target=target, + arms={"a": a, "b": b}, + selector=selector, + max_attempts_per_objective=3, + ) + + await dispatcher._perform_async(context=_make_context()) + + # Total attempts across arms must equal sum of selector counts. + total_attempts = sum( + selector.counts(context=GLOBAL_CONTEXT, technique=t)[1] for t in ("a", "b") + ) + total_calls = a.execute_async.call_count + b.execute_async.call_count + assert total_attempts == total_calls + + async def test_passes_objective_to_inner(self, target, selector): + a = _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS]) + dispatcher = AdaptiveDispatchAttack( + objective_target=target, + arms={"a": a}, + selector=selector, + ) + + await dispatcher._perform_async(context=_make_context(objective="my-goal")) + + kwargs = a.execute_async.call_args.kwargs + assert kwargs["objective"] == "my-goal" + + async def test_attaches_arm_and_attempt_labels(self, target, selector): + a = _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS]) + dispatcher = AdaptiveDispatchAttack( + objective_target=target, + arms={"a": a}, + selector=selector, + ) + + await dispatcher._perform_async(context=_make_context(labels={"foo": "bar"})) + + labels = a.execute_async.call_args.kwargs["memory_labels"] + assert labels["foo"] == "bar" # caller labels preserved + assert labels[ADAPTIVE_ARM_LABEL] == "a" + assert labels[ADAPTIVE_ATTEMPT_LABEL] == "1" + + async def test_uses_bandit_context_from_label(self, target, selector): + # Two arms; one has been heavily rewarded under context "violence" only. 
+ a = _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS]) + b = _make_inner_attack(name="b", outcomes=[AttackOutcome.SUCCESS]) + for _ in range(5): + selector.update(context="violence", technique="b", success=True) + for _ in range(5): + selector.update(context="violence", technique="a", success=False) + + dispatcher = AdaptiveDispatchAttack( + objective_target=target, + arms={"a": a, "b": b}, + selector=selector, + ) + ctx = _make_context(labels={BANDIT_CONTEXT_LABEL: "violence"}) + await dispatcher._perform_async(context=ctx) + + # Exploit should have picked "b" first. + assert b.execute_async.call_count == 1 + assert a.execute_async.call_count == 0 + + async def test_falls_back_to_global_context_when_label_missing(self, target, selector): + a = _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS]) + dispatcher = AdaptiveDispatchAttack( + objective_target=target, + arms={"a": a}, + selector=selector, + ) + await dispatcher._perform_async(context=_make_context(labels={})) + + # The global context bucket received the update. + assert selector.counts(context=GLOBAL_CONTEXT, technique="a") == (1, 1) + + async def test_metadata_records_adaptive_trail(self, target, selector): + # Arm "a" fails on the first attempt then succeeds; verify the trail + # captures both attempts in order. 
+ a = _make_inner_attack(name="a", outcomes=[AttackOutcome.FAILURE, AttackOutcome.SUCCESS]) + dispatcher = AdaptiveDispatchAttack( + objective_target=target, + arms={"a": a}, + selector=selector, + max_attempts_per_objective=3, + ) + result = await dispatcher._perform_async(context=_make_context()) + + trail = result.metadata["adaptive_attempts"] + assert trail == [ + {"technique": "a", "outcome": "failure"}, + {"technique": "a", "outcome": "success"}, + ] + assert result.metadata["adaptive_context"] == GLOBAL_CONTEXT + + +@pytest.mark.usefixtures("patch_central_database") +class TestValidate: + @pytest.mark.parametrize("bad_objective", ["", " ", "\n\t"]) + def test_validate_rejects_empty_objective(self, target, selector, bad_objective): + dispatcher = AdaptiveDispatchAttack( + objective_target=target, + arms={"a": _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS])}, + selector=selector, + ) + with pytest.raises(ValueError, match="objective"): + dispatcher._validate_context(context=_make_context(objective=bad_objective)) + + def test_validate_accepts_normal_objective(self, target, selector): + dispatcher = AdaptiveDispatchAttack( + objective_target=target, + arms={"a": _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS])}, + selector=selector, + ) + # Does not raise. + dispatcher._validate_context(context=_make_context(objective="ok")) diff --git a/tests/unit/scenario/scenarios/adaptive/test_selector.py b/tests/unit/scenario/scenarios/adaptive/test_selector.py new file mode 100644 index 000000000..7b5c75958 --- /dev/null +++ b/tests/unit/scenario/scenarios/adaptive/test_selector.py @@ -0,0 +1,188 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import random +from unittest.mock import MagicMock + +import pytest + +from pyrit.scenario.scenarios.adaptive.selector import ( + GLOBAL_CONTEXT, + UNCATEGORIZED_CONTEXT, + AdaptiveTechniqueSelector, + global_context, + harm_category_context, +) + + +ARMS = ["a", "b", "c", "d"] + + +def _seeded_selector(*, epsilon: float = 0.0, pool_threshold: int = 3, seed: int = 0) -> AdaptiveTechniqueSelector: + return AdaptiveTechniqueSelector( + epsilon=epsilon, + pool_threshold=pool_threshold, + rng=random.Random(seed), + ) + + +class TestAdaptiveTechniqueSelectorInit: + def test_init_defaults(self): + selector = AdaptiveTechniqueSelector() + assert selector.snapshot() == {} + + @pytest.mark.parametrize("bad_epsilon", [-0.1, 1.1, 2.0, -1.0]) + def test_init_rejects_out_of_range_epsilon(self, bad_epsilon): + with pytest.raises(ValueError, match="epsilon"): + AdaptiveTechniqueSelector(epsilon=bad_epsilon) + + def test_init_rejects_pool_threshold_below_one(self): + with pytest.raises(ValueError, match="pool_threshold"): + AdaptiveTechniqueSelector(pool_threshold=0) + with pytest.raises(ValueError, match="pool_threshold"): + AdaptiveTechniqueSelector(pool_threshold=-1) + + +class TestAdaptiveTechniqueSelectorSelect: + def test_select_empty_arms_raises(self): + selector = _seeded_selector() + with pytest.raises(ValueError, match="arms"): + selector.select(context=GLOBAL_CONTEXT, arms=[]) + + def test_select_all_unseen_ties_resolved_randomly(self): + # With epsilon=0 and an empty table, every arm has estimate 1/1=1.0, + # so the result is the seeded random tiebreak. Different seeds should + # be able to produce different winners. + winners = { + _seeded_selector(seed=s).select(context=GLOBAL_CONTEXT, arms=ARMS) + for s in range(50) + } + assert len(winners) > 1 + assert winners.issubset(set(ARMS)) + + def test_select_exploits_clear_winner(self): + selector = _seeded_selector(pool_threshold=1) + # Give "b" a track record of pure success, others pure failure. 
+ for _ in range(5): + selector.update(context=GLOBAL_CONTEXT, technique="b", success=True) + for arm in ("a", "c", "d"): + for _ in range(5): + selector.update(context=GLOBAL_CONTEXT, technique=arm, success=False) + + # With epsilon=0, every selection must exploit the winner. + for _ in range(20): + assert selector.select(context=GLOBAL_CONTEXT, arms=ARMS) == "b" + + def test_select_epsilon_one_is_pure_random(self): + selector = _seeded_selector(epsilon=1.0) + # Bias the table heavily toward "a"; with epsilon=1 it must still be ignored. + for _ in range(20): + selector.update(context=GLOBAL_CONTEXT, technique="a", success=True) + + picks = [selector.select(context=GLOBAL_CONTEXT, arms=ARMS) for _ in range(200)] + assert set(picks) == set(ARMS) + + def test_select_epsilon_zero_never_explores(self): + selector = _seeded_selector(epsilon=0.0, pool_threshold=1) + for _ in range(3): + selector.update(context=GLOBAL_CONTEXT, technique="a", success=True) + # Make the other arms tried-and-failed so they fall below "a"'s estimate; + # unseen arms would otherwise tie at the optimistic 1.0. + for arm in ("b", "c", "d"): + selector.update(context=GLOBAL_CONTEXT, technique=arm, success=False) + for _ in range(50): + assert selector.select(context=GLOBAL_CONTEXT, arms=ARMS) == "a" + + def test_select_cold_start_round_robins(self): + # Optimistic init + epsilon=0: untried arms tie at 1.0 and beat tried-and-failed + # arms (1/2 = 0.5). So the first failures push each arm to "tried" exactly once + # before any arm gets tried twice. 
+ selector = _seeded_selector(pool_threshold=1) + tried: list[str] = [] + for _ in range(len(ARMS)): + arm = selector.select(context=GLOBAL_CONTEXT, arms=ARMS) + tried.append(arm) + selector.update(context=GLOBAL_CONTEXT, technique=arm, success=False) + assert sorted(tried) == sorted(ARMS) + + +class TestAdaptiveTechniqueSelectorUpdate: + def test_update_accumulates_counts(self): + selector = _seeded_selector() + selector.update(context="ctx", technique="a", success=True) + selector.update(context="ctx", technique="a", success=False) + selector.update(context="ctx", technique="a", success=True) + assert selector.counts(context="ctx", technique="a") == (2, 3) + + def test_update_separate_contexts_are_independent(self): + selector = _seeded_selector() + selector.update(context="x", technique="a", success=True) + selector.update(context="y", technique="a", success=False) + assert selector.counts(context="x", technique="a") == (1, 1) + assert selector.counts(context="y", technique="a") == (0, 1) + + def test_counts_default_zero_for_unseen(self): + selector = _seeded_selector() + assert selector.counts(context="missing", technique="missing") == (0, 0) + + def test_update_keeps_pooled_global_counts_in_sync(self): + # Pooled-global counts back the O(1) pooled-backoff branch in _estimate. + # They must aggregate across contexts for a given arm. + selector = _seeded_selector(pool_threshold=5) + selector.update(context="x", technique="a", success=True) + selector.update(context="y", technique="a", success=False) + selector.update(context="z", technique="a", success=True) + selector.update(context="x", technique="b", success=True) + + # Below the local threshold, _estimate must use the pooled-global rate. 
+ # arm "a": 2 successes / 3 attempts -> (2+1)/(3+1) = 0.75 + assert selector.success_rate(context="new_ctx", technique="a") == pytest.approx(0.75) + # arm "b": 1/1 -> (1+1)/(1+1) = 1.0 + assert selector.success_rate(context="new_ctx", technique="b") == pytest.approx(1.0) + # Unseen arm "c" -> (0+1)/(0+1) = 1.0 + assert selector.success_rate(context="new_ctx", technique="c") == pytest.approx(1.0) + + +class TestAdaptiveTechniqueSelectorEstimate: + def test_success_rate_unseen_is_one(self): + # Optimistic init: (0 + 1) / (0 + 1) = 1.0 + selector = _seeded_selector() + assert selector.success_rate(context="ctx", technique="a") == pytest.approx(1.0) + + def test_success_rate_local_when_above_threshold(self): + selector = _seeded_selector(pool_threshold=2) + for _ in range(3): + selector.update(context="ctx", technique="a", success=True) + # (3 + 1) / (3 + 1) = 1.0 + assert selector.success_rate(context="ctx", technique="a") == pytest.approx(1.0) + + def test_success_rate_pools_when_below_threshold(self): + selector = _seeded_selector(pool_threshold=5) + # Local cell has only 1 attempt (below threshold). + selector.update(context="ctx", technique="a", success=False) + # Other contexts have plenty of data for arm "a". + for _ in range(10): + selector.update(context="other", technique="a", success=True) + # Pooled estimate = (10 + 0 + 1) / (10 + 1 + 1) = 11/12. 
+ assert selector.success_rate(context="ctx", technique="a") == pytest.approx(11 / 12) + + +class TestContextExtractors: + def test_global_context_is_constant(self): + sg = MagicMock() + assert global_context(sg) == GLOBAL_CONTEXT + + def test_harm_category_context_uses_first_category(self): + sg = MagicMock() + sg.harm_categories = ["violence", "hate"] + assert harm_category_context(sg) == "violence" + + def test_harm_category_context_falls_back_when_empty(self): + sg = MagicMock() + sg.harm_categories = [] + assert harm_category_context(sg) == UNCATEGORIZED_CONTEXT + + def test_harm_category_context_falls_back_when_none(self): + sg = MagicMock() + sg.harm_categories = None + assert harm_category_context(sg) == UNCATEGORIZED_CONTEXT diff --git a/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py b/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py new file mode 100644 index 000000000..82796def4 --- /dev/null +++ b/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py @@ -0,0 +1,282 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +"""Tests for the ``TextAdaptive`` scenario.""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest + +from pyrit.identifiers import ComponentIdentifier +from pyrit.models import SeedAttackGroup, SeedObjective +from pyrit.prompt_target import PromptTarget +from pyrit.registry.object_registries.attack_technique_registry import AttackTechniqueRegistry +from pyrit.scenario.core.dataset_configuration import DatasetConfiguration +from pyrit.scenario.core.scenario import BaselinePolicy +from pyrit.scenario.scenarios.adaptive.dispatcher import ( + BANDIT_CONTEXT_LABEL, + AdaptiveDispatchAttack, +) +from pyrit.scenario.scenarios.adaptive.selector import ( + GLOBAL_CONTEXT, + AdaptiveTechniqueSelector, + harm_category_context, +) +from pyrit.scenario.scenarios.adaptive.text_adaptive import TextAdaptive +from pyrit.score import TrueFalseScorer + + +_MOCK_MANY_SHOT_EXAMPLES = [{"question": f"q{i}", "answer": f"a{i}"} for i in range(100)] + + +def _mock_id(name: str) -> ComponentIdentifier: + return ComponentIdentifier(class_name=name, class_module="test") + + +@pytest.fixture +def mock_objective_target() -> MagicMock: + mock = MagicMock(spec=PromptTarget) + mock.get_identifier.return_value = _mock_id("MockObjectiveTarget") + return mock + + +@pytest.fixture +def mock_objective_scorer() -> MagicMock: + mock = MagicMock(spec=TrueFalseScorer) + mock.get_identifier.return_value = _mock_id("MockObjectiveScorer") + return mock + + +@pytest.fixture(autouse=True) +def reset_technique_registry(): + """Reset registries and the cached strategy class between tests.""" + from pyrit.registry import TargetRegistry + + AttackTechniqueRegistry.reset_instance() + TargetRegistry.reset_instance() + TextAdaptive._cached_strategy_class = None + yield + AttackTechniqueRegistry.reset_instance() + TargetRegistry.reset_instance() + TextAdaptive._cached_strategy_class = None + + +@pytest.fixture(autouse=True) +def patch_many_shot_load(): + with patch( 
+ "pyrit.executor.attack.single_turn.many_shot_jailbreak.load_many_shot_jailbreaking_dataset", + return_value=_MOCK_MANY_SHOT_EXAMPLES, + ): + yield + + +@pytest.fixture +def mock_runtime_env(): + with patch.dict( + "os.environ", + { + "OPENAI_CHAT_ENDPOINT": "https://test.openai.azure.com/", + "OPENAI_CHAT_KEY": "test-key", + "OPENAI_CHAT_MODEL": "gpt-4", + }, + ): + yield + + +def _make_seed_group(*, value: str, harm_categories: list[str] | None = None) -> SeedAttackGroup: + return SeedAttackGroup(seeds=[SeedObjective(value=value, harm_categories=harm_categories)]) + + +FIXTURES = ["patch_central_database", "mock_runtime_env"] + + +@pytest.mark.usefixtures(*FIXTURES) +class TestTextAdaptiveBasics: + def test_version(self): + assert TextAdaptive.VERSION == 1 + + def test_baseline_forbidden(self): + assert TextAdaptive.BASELINE_POLICY is BaselinePolicy.Forbidden + + def test_default_dataset_config(self): + config = TextAdaptive.default_dataset_config() + assert isinstance(config, DatasetConfiguration) + assert config.max_dataset_size == 4 + + def test_required_datasets_non_empty(self): + assert len(TextAdaptive.required_datasets()) > 0 + + def test_get_strategy_class_is_cached(self): + cls_a = TextAdaptive.get_strategy_class() + cls_b = TextAdaptive.get_strategy_class() + assert cls_a is cls_b + + def test_get_default_strategy(self): + strat = TextAdaptive.get_default_strategy() + # The default aggregate must resolve to something runnable. 
+ assert strat is not None + + @patch("pyrit.scenario.core.scenario.Scenario._get_default_objective_scorer") + def test_init_stores_bandit_params(self, mock_get_scorer, mock_objective_scorer): + mock_get_scorer.return_value = mock_objective_scorer + scenario = TextAdaptive( + epsilon=0.4, + pool_threshold=5, + max_attempts_per_objective=7, + seed=42, + ) + assert scenario._epsilon == 0.4 + assert scenario._pool_threshold == 5 + assert scenario._max_attempts_per_objective == 7 + assert scenario._seed == 42 + + +@pytest.mark.usefixtures(*FIXTURES) +class TestTextAdaptiveAtomicAttacks: + """Tests for ``_get_atomic_attacks_async`` overriding.""" + + async def _build_scenario_and_attacks( + self, + *, + mock_objective_target, + mock_objective_scorer, + seed_groups: dict[str, list[SeedAttackGroup]], + **scenario_kwargs, + ): + with patch.object(DatasetConfiguration, "get_seed_attack_groups", return_value=seed_groups): + scenario = TextAdaptive( + objective_scorer=mock_objective_scorer, + **scenario_kwargs, + ) + await scenario.initialize_async( + objective_target=mock_objective_target, + include_baseline=False, + ) + return scenario, await scenario._get_atomic_attacks_async() + + async def test_one_atomic_per_objective(self, mock_objective_target, mock_objective_scorer): + groups = { + "violence": [ + _make_seed_group(value="obj-v1", harm_categories=["violence"]), + _make_seed_group(value="obj-v2", harm_categories=["violence"]), + ], + "hate": [ + _make_seed_group(value="obj-h1", harm_categories=["hate"]), + ], + } + _scenario, attacks = await self._build_scenario_and_attacks( + mock_objective_target=mock_objective_target, + mock_objective_scorer=mock_objective_scorer, + seed_groups=groups, + ) + assert len(attacks) == 3 + for atomic in attacks: + # Each atomic carries exactly one seed group. 
+ assert len(atomic.objectives) == 1 + + async def test_all_atomics_share_one_dispatcher(self, mock_objective_target, mock_objective_scorer): + groups = { + "violence": [ + _make_seed_group(value="obj-v1", harm_categories=["violence"]), + _make_seed_group(value="obj-v2", harm_categories=["violence"]), + ], + } + scenario, attacks = await self._build_scenario_and_attacks( + mock_objective_target=mock_objective_target, + mock_objective_scorer=mock_objective_scorer, + seed_groups=groups, + ) + dispatchers = {atomic._attack_technique.attack for atomic in attacks} + assert len(dispatchers) == 1 + assert isinstance(next(iter(dispatchers)), AdaptiveDispatchAttack) + assert isinstance(scenario._selector, AdaptiveTechniqueSelector) + + async def test_global_context_label_when_using_global_extractor( + self, mock_objective_target, mock_objective_scorer + ): + groups = { + "violence": [_make_seed_group(value="obj-1", harm_categories=["violence"])], + "hate": [_make_seed_group(value="obj-2", harm_categories=["hate"])], + } + _scenario, attacks = await self._build_scenario_and_attacks( + mock_objective_target=mock_objective_target, + mock_objective_scorer=mock_objective_scorer, + seed_groups=groups, + ) + for atomic in attacks: + assert atomic._memory_labels[BANDIT_CONTEXT_LABEL] == GLOBAL_CONTEXT + + async def test_harm_category_extractor_partitions_labels( + self, mock_objective_target, mock_objective_scorer + ): + groups = { + "violence": [_make_seed_group(value="obj-v", harm_categories=["violence"])], + "hate": [_make_seed_group(value="obj-h", harm_categories=["hate"])], + "uncat": [_make_seed_group(value="obj-u", harm_categories=None)], + } + _scenario, attacks = await self._build_scenario_and_attacks( + mock_objective_target=mock_objective_target, + mock_objective_scorer=mock_objective_scorer, + seed_groups=groups, + context_extractor=harm_category_context, + ) + contexts = {atomic._memory_labels[BANDIT_CONTEXT_LABEL] for atomic in attacks} + # Each objective gets its own 
context bucket from harm_category_context. + assert contexts == {"violence", "hate", "_uncategorized"} + + async def test_atomic_names_are_unique(self, mock_objective_target, mock_objective_scorer): + groups = { + "violence": [ + _make_seed_group(value=f"obj-{i}", harm_categories=["violence"]) for i in range(5) + ], + } + _scenario, attacks = await self._build_scenario_and_attacks( + mock_objective_target=mock_objective_target, + mock_objective_scorer=mock_objective_scorer, + seed_groups=groups, + ) + names = [atomic.atomic_attack_name for atomic in attacks] + assert len(set(names)) == len(names) + + async def test_display_group_is_dataset_name(self, mock_objective_target, mock_objective_scorer): + groups = { + "violence": [_make_seed_group(value="obj-v", harm_categories=["violence"])], + "hate": [_make_seed_group(value="obj-h", harm_categories=["hate"])], + } + _scenario, attacks = await self._build_scenario_and_attacks( + mock_objective_target=mock_objective_target, + mock_objective_scorer=mock_objective_scorer, + seed_groups=groups, + ) + display_groups = {atomic.display_group for atomic in attacks} + assert display_groups == {"violence", "hate"} + + async def test_no_usable_techniques_raises(self, mock_objective_target, mock_objective_scorer): + groups = {"violence": [_make_seed_group(value="obj")]} + with patch.object(DatasetConfiguration, "get_seed_attack_groups", return_value=groups): + scenario = TextAdaptive(objective_scorer=mock_objective_scorer) + await scenario.initialize_async( + objective_target=mock_objective_target, + include_baseline=False, + ) + # Force the factory map to be empty. 
+ with patch.object(scenario, "_get_attack_technique_factories", return_value={}): + with pytest.raises(ValueError, match="no usable techniques"): + await scenario._get_atomic_attacks_async() + + +@pytest.mark.usefixtures(*FIXTURES) +class TestTextAdaptiveBaselinePolicy: + async def test_initialize_async_rejects_explicit_baseline( + self, mock_objective_target, mock_objective_scorer + ): + groups = {"violence": [_make_seed_group(value="obj", harm_categories=["violence"])]} + with patch.object(DatasetConfiguration, "get_seed_attack_groups", return_value=groups): + scenario = TextAdaptive(objective_scorer=mock_objective_scorer) + with pytest.raises(ValueError): + await scenario.initialize_async( + objective_target=mock_objective_target, + include_baseline=True, + ) From 09e3007c47ee9c5b44353c17778557ed2bf86c1f Mon Sep 17 00:00:00 2001 From: hannahwestra25 Date: Mon, 18 May 2026 11:19:48 -0400 Subject: [PATCH 02/12] merge --- pyrit/scenario/scenarios/adaptive/dispatcher.py | 7 +++---- pyrit/scenario/scenarios/adaptive/selector.py | 7 ++++--- .../scenarios/adaptive/text_adaptive.py | 5 +---- .../scenarios/adaptive/test_dispatcher.py | 4 +--- .../scenarios/adaptive/test_selector.py | 6 +----- .../scenarios/adaptive/test_text_adaptive.py | 17 ++++------------- 6 files changed, 14 insertions(+), 32 deletions(-) diff --git a/pyrit/scenario/scenarios/adaptive/dispatcher.py b/pyrit/scenario/scenarios/adaptive/dispatcher.py index ae1087a14..7b5d6b5f3 100644 --- a/pyrit/scenario/scenarios/adaptive/dispatcher.py +++ b/pyrit/scenario/scenarios/adaptive/dispatcher.py @@ -45,7 +45,8 @@ @dataclass class AdaptiveDispatchContext(AttackContext[AttackParameters]): - """Execution context for ``AdaptiveDispatchAttack``. + """ + Execution context for ``AdaptiveDispatchAttack``. No extra state is needed beyond what ``AttackContext`` provides; the dispatcher reads the objective and memory labels from the base class. 
@@ -91,9 +92,7 @@ def __init__( if not arms: raise ValueError("arms must contain at least one technique") if max_attempts_per_objective < 1: - raise ValueError( - f"max_attempts_per_objective must be >= 1, got {max_attempts_per_objective}" - ) + raise ValueError(f"max_attempts_per_objective must be >= 1, got {max_attempts_per_objective}") super().__init__( objective_target=objective_target, diff --git a/pyrit/scenario/scenarios/adaptive/selector.py b/pyrit/scenario/scenarios/adaptive/selector.py index ff2794757..495c13b7c 100644 --- a/pyrit/scenario/scenarios/adaptive/selector.py +++ b/pyrit/scenario/scenarios/adaptive/selector.py @@ -20,7 +20,8 @@ from __future__ import annotations import random -from typing import TYPE_CHECKING, Callable, Sequence +from collections.abc import Callable, Sequence +from typing import TYPE_CHECKING if TYPE_CHECKING: from pyrit.models.seeds.seed_attack_group import SeedAttackGroup @@ -34,12 +35,12 @@ UNCATEGORIZED_CONTEXT: str = "_uncategorized" -def global_context(_seed_attack_group: "SeedAttackGroup") -> str: +def global_context(_seed_attack_group: SeedAttackGroup) -> str: """Return a constant context so all objectives share one bandit table.""" return GLOBAL_CONTEXT -def harm_category_context(seed_attack_group: "SeedAttackGroup") -> str: +def harm_category_context(seed_attack_group: SeedAttackGroup) -> str: """Return the first harm category on the seed group, or a fallback.""" categories = seed_attack_group.harm_categories if not categories: diff --git a/pyrit/scenario/scenarios/adaptive/text_adaptive.py b/pyrit/scenario/scenarios/adaptive/text_adaptive.py index 2fa97b706..e9be071be 100644 --- a/pyrit/scenario/scenarios/adaptive/text_adaptive.py +++ b/pyrit/scenario/scenarios/adaptive/text_adaptive.py @@ -191,8 +191,6 @@ async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: "Scenario not properly initialized. Call await scenario.initialize_async() before running." 
) - from pyrit.scenario.core.atomic_attack import AtomicAttack - selected_arms = sorted({s.value for s in self._scenario_strategies}) factories = self._get_attack_technique_factories() @@ -212,8 +210,7 @@ async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: if not arms: raise ValueError( - "TextAdaptive: no usable techniques after resolving strategies. " - "Check the --strategies selection." + "TextAdaptive: no usable techniques after resolving strategies. Check the --strategies selection." ) selector = AdaptiveTechniqueSelector( diff --git a/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py b/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py index 68051f3d5..9e26425cf 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py +++ b/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py @@ -116,9 +116,7 @@ async def test_updates_selector_on_each_attempt(self, target, selector): await dispatcher._perform_async(context=_make_context()) # Total attempts across arms must equal sum of selector counts. - total_attempts = sum( - selector.counts(context=GLOBAL_CONTEXT, technique=t)[1] for t in ("a", "b") - ) + total_attempts = sum(selector.counts(context=GLOBAL_CONTEXT, technique=t)[1] for t in ("a", "b")) total_calls = a.execute_async.call_count + b.execute_async.call_count assert total_attempts == total_calls diff --git a/tests/unit/scenario/scenarios/adaptive/test_selector.py b/tests/unit/scenario/scenarios/adaptive/test_selector.py index 7b5c75958..eaddc32ce 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_selector.py +++ b/tests/unit/scenario/scenarios/adaptive/test_selector.py @@ -14,7 +14,6 @@ harm_category_context, ) - ARMS = ["a", "b", "c", "d"] @@ -53,10 +52,7 @@ def test_select_all_unseen_ties_resolved_randomly(self): # With epsilon=0 and an empty table, every arm has estimate 1/1=1.0, # so the result is the seeded random tiebreak. Different seeds should # be able to produce different winners. 
- winners = { - _seeded_selector(seed=s).select(context=GLOBAL_CONTEXT, arms=ARMS) - for s in range(50) - } + winners = {_seeded_selector(seed=s).select(context=GLOBAL_CONTEXT, arms=ARMS) for s in range(50)} assert len(winners) > 1 assert winners.issubset(set(ARMS)) diff --git a/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py b/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py index 82796def4..77176e03e 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py +++ b/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py @@ -27,7 +27,6 @@ from pyrit.scenario.scenarios.adaptive.text_adaptive import TextAdaptive from pyrit.score import TrueFalseScorer - _MOCK_MANY_SHOT_EXAMPLES = [{"question": f"q{i}", "answer": f"a{i}"} for i in range(100)] @@ -193,9 +192,7 @@ async def test_all_atomics_share_one_dispatcher(self, mock_objective_target, moc assert isinstance(next(iter(dispatchers)), AdaptiveDispatchAttack) assert isinstance(scenario._selector, AdaptiveTechniqueSelector) - async def test_global_context_label_when_using_global_extractor( - self, mock_objective_target, mock_objective_scorer - ): + async def test_global_context_label_when_using_global_extractor(self, mock_objective_target, mock_objective_scorer): groups = { "violence": [_make_seed_group(value="obj-1", harm_categories=["violence"])], "hate": [_make_seed_group(value="obj-2", harm_categories=["hate"])], @@ -208,9 +205,7 @@ async def test_global_context_label_when_using_global_extractor( for atomic in attacks: assert atomic._memory_labels[BANDIT_CONTEXT_LABEL] == GLOBAL_CONTEXT - async def test_harm_category_extractor_partitions_labels( - self, mock_objective_target, mock_objective_scorer - ): + async def test_harm_category_extractor_partitions_labels(self, mock_objective_target, mock_objective_scorer): groups = { "violence": [_make_seed_group(value="obj-v", harm_categories=["violence"])], "hate": [_make_seed_group(value="obj-h", harm_categories=["hate"])], @@ 
-228,9 +223,7 @@ async def test_harm_category_extractor_partitions_labels( async def test_atomic_names_are_unique(self, mock_objective_target, mock_objective_scorer): groups = { - "violence": [ - _make_seed_group(value=f"obj-{i}", harm_categories=["violence"]) for i in range(5) - ], + "violence": [_make_seed_group(value=f"obj-{i}", harm_categories=["violence"]) for i in range(5)], } _scenario, attacks = await self._build_scenario_and_attacks( mock_objective_target=mock_objective_target, @@ -269,9 +262,7 @@ async def test_no_usable_techniques_raises(self, mock_objective_target, mock_obj @pytest.mark.usefixtures(*FIXTURES) class TestTextAdaptiveBaselinePolicy: - async def test_initialize_async_rejects_explicit_baseline( - self, mock_objective_target, mock_objective_scorer - ): + async def test_initialize_async_rejects_explicit_baseline(self, mock_objective_target, mock_objective_scorer): groups = {"violence": [_make_seed_group(value="obj", harm_categories=["violence"])]} with patch.object(DatasetConfiguration, "get_seed_attack_groups", return_value=groups): scenario = TextAdaptive(objective_scorer=mock_objective_scorer) From 70d14c427e6bcbd8a74700d92dc047ae6e657ecc Mon Sep 17 00:00:00 2001 From: hannahwestra25 Date: Mon, 18 May 2026 17:23:52 -0400 Subject: [PATCH 03/12] proofread --- doc/code/scenarios/3_text_adaptive.ipynb | 345 ++++++++++++++++++ doc/code/scenarios/3_text_adaptive.py | 220 +++++++++++ pyrit/scenario/scenarios/adaptive/__init__.py | 4 +- .../scenario/scenarios/adaptive/dispatcher.py | 55 +-- pyrit/scenario/scenarios/adaptive/selector.py | 105 +++--- .../scenarios/adaptive/text_adaptive.py | 74 ++-- .../scenarios/adaptive/test_dispatcher.py | 48 +-- .../scenarios/adaptive/test_selector.py | 96 ++--- .../scenarios/adaptive/test_text_adaptive.py | 8 +- 9 files changed, 771 insertions(+), 184 deletions(-) create mode 100644 doc/code/scenarios/3_text_adaptive.ipynb create mode 100644 doc/code/scenarios/3_text_adaptive.py diff --git 
a/doc/code/scenarios/3_text_adaptive.ipynb b/doc/code/scenarios/3_text_adaptive.ipynb new file mode 100644 index 000000000..474c1872d --- /dev/null +++ b/doc/code/scenarios/3_text_adaptive.ipynb @@ -0,0 +1,345 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "33fbe4e9", + "metadata": {}, + "source": [ + "# TextAdaptive Scenario\n", + "\n", + "The `TextAdaptive` scenario uses an **epsilon-greedy selector** to intelligently choose\n", + "which attack technique to try for each objective. Unlike static scenarios that run every\n", + "selected technique against every objective, `TextAdaptive` adapts its strategy selection\n", + "based on observed success rates — spending more attempts on techniques that work and\n", + "exploring new ones with a configurable probability.\n", + "\n", + "## How It Works\n", + "\n", + "For each objective (prompt), the selector:\n", + "\n", + "1. **Explores** with probability `epsilon` — picks a technique uniformly at random.\n", + "2. **Exploits** otherwise — picks the technique with the highest observed success rate.\n", + "3. **Stops early** when a technique succeeds, avoiding wasted attempts.\n", + "4. 
Tries **up to** `max_attempts_per_objective` techniques before moving on.\n", + "\n", + "Unseen techniques start with an optimistic prior (100% success estimate), so the first\n", + "few objectives effectively round-robin through every available technique before the\n", + "selector converges on the best performers.\n", + "\n", + "## Key Differences from Static Scenarios\n", + "\n", + "| Feature | Static Scenarios | TextAdaptive |\n", + "|---------|-----------------|--------------|\n", + "| Technique selection | Run all selected techniques | Selector picks per-objective |\n", + "| Early stopping | No | Yes — stops on first success |\n", + "| Learning | None | Updates success rates after each attempt |\n", + "| Baseline | Prepended automatically | Forbidden — `prompt_sending` is a technique |\n", + "| Efficiency | O(techniques × objectives) | O(max_attempts × objectives) |" + ] + }, + { + "cell_type": "markdown", + "id": "84bce821", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62eebc44", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "from pyrit.scenario.scenarios.adaptive import TextAdaptive, harm_category_context\n", + "from pyrit.registry import TargetRegistry\n", + "from pyrit.scenario import DatasetConfiguration\n", + "from pyrit.scenario.printer.console_printer import ConsoleScenarioResultPrinter\n", + "from pyrit.setup import initialize_from_config_async\n", + "\n", + "await initialize_from_config_async(config_path=Path(\"../../scanner/pyrit_conf.yaml\")) # type: ignore\n", + "\n", + "objective_target = TargetRegistry.get_registry_singleton().get_instance_by_name(\"openai_chat\")\n", + "printer = ConsoleScenarioResultPrinter()" + ] + }, + { + "cell_type": "markdown", + "id": "93a67a83", + "metadata": {}, + "source": [ + "## Basic Usage\n", + "\n", + "The simplest way to run `TextAdaptive` uses all defaults: the selector explores with 20%\n", + 
"probability, tries up to 3 techniques per objective, and uses the default dataset\n", + "(AIRT harm categories)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f74ce854", + "metadata": {}, + "outputs": [], + "source": [ + "scenario = TextAdaptive()\n", + "\n", + "await scenario.initialize_async( # type: ignore\n", + " objective_target=objective_target,\n", + ")\n", + "result = await scenario.run_async() # type: ignore\n", + "await printer.print_summary_async(result) # type: ignore" + ] + }, + { + "cell_type": "markdown", + "id": "4bc43876", + "metadata": {}, + "source": [ + "## Customizing the Selector\n", + "\n", + "### Epsilon (Exploration Rate)\n", + "\n", + "`epsilon` controls how often the selector explores vs. exploits:\n", + "- `epsilon=0.0` — pure exploitation (always pick the best-known technique)\n", + "- `epsilon=1.0` — pure exploration (random selection every time)\n", + "- `epsilon=0.2` (default) — 20% random exploration, 80% exploitation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac0326e8", + "metadata": {}, + "outputs": [], + "source": [ + "# More explorative selector — useful when you want broader technique coverage\n", + "explorative_scenario = TextAdaptive(epsilon=0.5)\n", + "\n", + "await explorative_scenario.initialize_async( # type: ignore\n", + " objective_target=objective_target,\n", + " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=4),\n", + ")\n", + "explorative_result = await explorative_scenario.run_async() # type: ignore\n", + "await printer.print_summary_async(explorative_result) # type: ignore" + ] + }, + { + "cell_type": "markdown", + "id": "f0ff26ab", + "metadata": {}, + "source": [ + "### Max Attempts Per Objective\n", + "\n", + "`max_attempts_per_objective` caps how many techniques the selector tries before giving\n", + "up on an objective. Setting this higher gives more chances to succeed but costs more\n", + "API calls." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c638b3e5", + "metadata": {}, + "outputs": [], + "source": [ + "persistent_scenario = TextAdaptive(max_attempts_per_objective=5)\n", + "\n", + "await persistent_scenario.initialize_async( # type: ignore\n", + " objective_target=objective_target,\n", + " dataset_config=DatasetConfiguration(dataset_names=[\"airt_violence\"], max_dataset_size=4),\n", + ")\n", + "persistent_result = await persistent_scenario.run_async() # type: ignore\n", + "await printer.print_summary_async(persistent_result) # type: ignore" + ] + }, + { + "cell_type": "markdown", + "id": "3d9e697f", + "metadata": {}, + "source": [ + "## Context-Aware Selection\n", + "\n", + "By default, the selector shares one global table across all objectives. This means\n", + "a technique that works well on hate-speech objectives also gets boosted for\n", + "violence objectives.\n", + "\n", + "To partition the selector by harm category (so each category learns independently),\n", + "pass `harm_category_context` as the `context_extractor`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "954e9c65", + "metadata": {}, + "outputs": [], + "source": [ + "contextual_scenario = TextAdaptive(\n", + " context_extractor=harm_category_context,\n", + " pool_threshold=2,\n", + ")\n", + "\n", + "await contextual_scenario.initialize_async( # type: ignore\n", + " objective_target=objective_target,\n", + " dataset_config=DatasetConfiguration(\n", + " dataset_names=[\"airt_hate\", \"airt_violence\"],\n", + " max_dataset_size=4,\n", + " ),\n", + ")\n", + "contextual_result = await contextual_scenario.run_async() # type: ignore\n", + "await printer.print_summary_async(contextual_result) # type: ignore" + ] + }, + { + "cell_type": "markdown", + "id": "c6319923", + "metadata": {}, + "source": [ + "The `pool_threshold` parameter controls how many local observations are needed before\n", + "the per-category estimate overrides the pooled-global 
estimate. With\n", + "`pool_threshold=2`, the selector uses the global average until it has seen at least 2\n", + "results for a specific (category, technique) pair." + ] + }, + { + "cell_type": "markdown", + "id": "df56a4af", + "metadata": {}, + "source": [ + "## Strategy Selection\n", + "\n", + "`TextAdaptive` builds its strategy enum dynamically from the scenario-techniques\n", + "catalog. You can restrict which techniques participate using the\n", + "`scenario_strategies` parameter:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbd3b91f", + "metadata": {}, + "outputs": [], + "source": [ + "strategy_class = TextAdaptive.get_strategy_class()\n", + "\n", + "# See all available strategies\n", + "print(\"Available strategies:\")\n", + "for member in strategy_class:\n", + " print(f\" {member.value}\")" + ] + }, + { + "cell_type": "markdown", + "id": "004bea90", + "metadata": {}, + "source": [ + "To limit the selector to only single-turn techniques:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f081a04d", + "metadata": {}, + "outputs": [], + "source": [ + "single_turn_scenario = TextAdaptive()\n", + "\n", + "await single_turn_scenario.initialize_async( # type: ignore\n", + " objective_target=objective_target,\n", + " scenario_strategies=[strategy_class(\"single_turn\")],\n", + " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=4),\n", + ")\n", + "single_turn_result = await single_turn_scenario.run_async() # type: ignore\n", + "await printer.print_summary_async(single_turn_result) # type: ignore" + ] + }, + { + "cell_type": "markdown", + "id": "bd48c512", + "metadata": {}, + "source": [ + "## Deterministic Runs\n", + "\n", + "For reproducibility, pass a `seed` to make the selector's random decisions deterministic:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c022bc5c", + "metadata": {}, + "outputs": [], + "source": [ + "deterministic_scenario = 
TextAdaptive(seed=42, epsilon=0.3)\n", + "\n", + "await deterministic_scenario.initialize_async( # type: ignore\n", + " objective_target=objective_target,\n", + " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=2),\n", + ")\n", + "deterministic_result = await deterministic_scenario.run_async() # type: ignore\n", + "await printer.print_summary_async(deterministic_result) # type: ignore" + ] + }, + { + "cell_type": "markdown", + "id": "0878f42e", + "metadata": {}, + "source": [ + "## Custom Scorer\n", + "\n", + "By default, `TextAdaptive` uses the standard composite scorer. You can override it\n", + "with any `TrueFalseScorer`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e251342b", + "metadata": {}, + "outputs": [], + "source": [ + "from pyrit.prompt_target import OpenAIChatTarget\n", + "from pyrit.score import SelfAskRefusalScorer, TrueFalseInverterScorer\n", + "\n", + "refusal_scorer = SelfAskRefusalScorer(chat_target=OpenAIChatTarget())\n", + "inverted_scorer = TrueFalseInverterScorer(scorer=refusal_scorer)\n", + "\n", + "custom_scorer_scenario = TextAdaptive(objective_scorer=inverted_scorer)\n", + "\n", + "await custom_scorer_scenario.initialize_async( # type: ignore\n", + " objective_target=objective_target,\n", + " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=2),\n", + ")\n", + "custom_result = await custom_scorer_scenario.run_async() # type: ignore\n", + "await printer.print_summary_async(custom_result) # type: ignore" + ] + }, + { + "cell_type": "markdown", + "id": "ba2bda21", + "metadata": {}, + "source": [ + "## Notes\n", + "\n", + "- **No baseline**: `TextAdaptive` has `BASELINE_POLICY = Forbidden`. 
The `prompt_sending`\n", + " technique participates as one of the selector's techniques, so a separate baseline is redundant.\n", + "- **Resumability**: Each atomic attack is keyed by `adaptive_{dataset}_{objective_id}`, so\n", + " re-running a scenario picks up where it left off.\n", + "- **Shared selector**: All objectives in a run share the same `AdaptiveTechniqueSelector`\n", + " instance, so learning from one objective immediately benefits the next." + ] + } + ], + "metadata": { + "jupytext": { + "main_language": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/code/scenarios/3_text_adaptive.py b/doc/code/scenarios/3_text_adaptive.py new file mode 100644 index 000000000..a5774753a --- /dev/null +++ b/doc/code/scenarios/3_text_adaptive.py @@ -0,0 +1,220 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.18.1 +# --- + +# %% [markdown] +# # TextAdaptive Scenario +# +# The `TextAdaptive` scenario uses an **epsilon-greedy selector** to intelligently choose +# which attack technique to try for each objective. Unlike static scenarios that run every +# selected technique against every objective, `TextAdaptive` adapts its strategy selection +# based on observed success rates — spending more attempts on techniques that work and +# exploring new ones with a configurable probability. +# +# ## How It Works +# +# For each objective (prompt), the selector: +# +# 1. **Explores** with probability `epsilon` — picks a technique uniformly at random. +# 2. **Exploits** otherwise — picks the technique with the highest observed success rate. +# 3. **Stops early** when a technique succeeds, avoiding wasted attempts. +# 4. Tries **up to** `max_attempts_per_objective` techniques before moving on. 
+# +# Unseen techniques start with an optimistic prior (100% success estimate), so the first +# few objectives effectively round-robin through every available technique before the +# selector converges on the best performers. +# +# ## Key Differences from Static Scenarios +# +# | Feature | Static Scenarios | TextAdaptive | +# |---------|-----------------|--------------| +# | Technique selection | Run all selected techniques | Selector picks per-objective | +# | Early stopping | No | Yes — stops on first success | +# | Learning | None | Updates success rates after each attempt | +# | Baseline | Prepended automatically | Forbidden — `prompt_sending` is a technique | +# | Efficiency | O(techniques × objectives) | O(max_attempts × objectives) | + +# %% [markdown] +# ## Setup + +# %% +from pathlib import Path + +from pyrit.scenario.scenarios.adaptive import TextAdaptive, harm_category_context +from pyrit.registry import TargetRegistry +from pyrit.scenario import DatasetConfiguration +from pyrit.scenario.printer.console_printer import ConsoleScenarioResultPrinter +from pyrit.setup import initialize_from_config_async + +await initialize_from_config_async(config_path=Path("../../scanner/pyrit_conf.yaml")) # type: ignore + +objective_target = TargetRegistry.get_registry_singleton().get_instance_by_name("openai_chat") +printer = ConsoleScenarioResultPrinter() + +# %% [markdown] +# ## Basic Usage +# +# The simplest way to run `TextAdaptive` uses all defaults: the selector explores with 20% +# probability, tries up to 3 techniques per objective, and uses the default dataset +# (AIRT harm categories). + +# %% +scenario = TextAdaptive() + +await scenario.initialize_async( # type: ignore + objective_target=objective_target, +) +result = await scenario.run_async() # type: ignore +await printer.print_summary_async(result) # type: ignore + +# %% [markdown] +# ## Customizing the Selector +# +# ### Epsilon (Exploration Rate) +# +# `epsilon` controls how often the selector explores vs. 
exploits: +# - `epsilon=0.0` — pure exploitation (always pick the best-known technique) +# - `epsilon=1.0` — pure exploration (random selection every time) +# - `epsilon=0.2` (default) — 20% random exploration, 80% exploitation + +# %% +# More explorative selector — useful when you want broader technique coverage +explorative_scenario = TextAdaptive(epsilon=0.5) + +await explorative_scenario.initialize_async( # type: ignore + objective_target=objective_target, + dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=4), +) +explorative_result = await explorative_scenario.run_async() # type: ignore +await printer.print_summary_async(explorative_result) # type: ignore + +# %% [markdown] +# ### Max Attempts Per Objective +# +# `max_attempts_per_objective` caps how many techniques the selector tries before giving +# up on an objective. Setting this higher gives more chances to succeed but costs more +# API calls. + +# %% +persistent_scenario = TextAdaptive(max_attempts_per_objective=5) + +await persistent_scenario.initialize_async( # type: ignore + objective_target=objective_target, + dataset_config=DatasetConfiguration(dataset_names=["airt_violence"], max_dataset_size=4), +) +persistent_result = await persistent_scenario.run_async() # type: ignore +await printer.print_summary_async(persistent_result) # type: ignore + +# %% [markdown] +# ## Context-Aware Selection +# +# By default, the selector shares one global table across all objectives. This means +# a technique that works well on hate-speech objectives also gets boosted for +# violence objectives. 
+# +# To partition the selector by harm category (so each category learns independently), +# pass `harm_category_context` as the `context_extractor`: + +# %% +contextual_scenario = TextAdaptive( + context_extractor=harm_category_context, + pool_threshold=2, +) + +await contextual_scenario.initialize_async( # type: ignore + objective_target=objective_target, + dataset_config=DatasetConfiguration( + dataset_names=["airt_hate", "airt_violence"], + max_dataset_size=4, + ), +) +contextual_result = await contextual_scenario.run_async() # type: ignore +await printer.print_summary_async(contextual_result) # type: ignore + +# %% [markdown] +# The `pool_threshold` parameter controls how many local observations are needed before +# the per-category estimate overrides the pooled-global estimate. With +# `pool_threshold=2`, the selector uses the global average until it has seen at least 2 +# results for a specific (category, technique) pair. + +# %% [markdown] +# ## Strategy Selection +# +# `TextAdaptive` builds its strategy enum dynamically from the scenario-techniques +# catalog. 
You can restrict which techniques participate using the +# `scenario_strategies` parameter: + +# %% +strategy_class = TextAdaptive.get_strategy_class() + +# See all available strategies +print("Available strategies:") +for member in strategy_class: + print(f" {member.value}") + +# %% [markdown] +# To limit the selector to only single-turn techniques: + +# %% +single_turn_scenario = TextAdaptive() + +await single_turn_scenario.initialize_async( # type: ignore + objective_target=objective_target, + scenario_strategies=[strategy_class("single_turn")], + dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=4), +) +single_turn_result = await single_turn_scenario.run_async() # type: ignore +await printer.print_summary_async(single_turn_result) # type: ignore + +# %% [markdown] +# ## Deterministic Runs +# +# For reproducibility, pass a `seed` to make the selector's random decisions deterministic: + +# %% +deterministic_scenario = TextAdaptive(seed=42, epsilon=0.3) + +await deterministic_scenario.initialize_async( # type: ignore + objective_target=objective_target, + dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=2), +) +deterministic_result = await deterministic_scenario.run_async() # type: ignore +await printer.print_summary_async(deterministic_result) # type: ignore + +# %% [markdown] +# ## Custom Scorer +# +# By default, `TextAdaptive` uses the standard composite scorer. 
You can override it +# with any `TrueFalseScorer`: + +# %% +from pyrit.prompt_target import OpenAIChatTarget +from pyrit.score import SelfAskRefusalScorer, TrueFalseInverterScorer + +refusal_scorer = SelfAskRefusalScorer(chat_target=OpenAIChatTarget()) +inverted_scorer = TrueFalseInverterScorer(scorer=refusal_scorer) + +custom_scorer_scenario = TextAdaptive(objective_scorer=inverted_scorer) + +await custom_scorer_scenario.initialize_async( # type: ignore + objective_target=objective_target, + dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=2), +) +custom_result = await custom_scorer_scenario.run_async() # type: ignore +await printer.print_summary_async(custom_result) # type: ignore + +# %% [markdown] +# ## Notes +# +# - **No baseline**: `TextAdaptive` has `BASELINE_POLICY = Forbidden`. The `prompt_sending` +# technique participates as one of the selector's techniques, so a separate baseline is redundant. +# - **Resumability**: Each atomic attack is keyed by `adaptive_{dataset}_{objective_id}`, so +# re-running a scenario picks up where it left off. +# - **Shared selector**: All objectives in a run share the same `AdaptiveTechniqueSelector` +# instance, so learning from one objective immediately benefits the next. 
diff --git a/pyrit/scenario/scenarios/adaptive/__init__.py b/pyrit/scenario/scenarios/adaptive/__init__.py index e06e166a6..2fb58b888 100644 --- a/pyrit/scenario/scenarios/adaptive/__init__.py +++ b/pyrit/scenario/scenarios/adaptive/__init__.py @@ -4,7 +4,7 @@ """Adaptive scenario classes.""" from pyrit.scenario.scenarios.adaptive.dispatcher import ( - BANDIT_CONTEXT_LABEL, + ADAPTIVE_CONTEXT_LABEL, AdaptiveDispatchAttack, ) from pyrit.scenario.scenarios.adaptive.selector import ( @@ -16,9 +16,9 @@ from pyrit.scenario.scenarios.adaptive.text_adaptive import TextAdaptive __all__ = [ + "ADAPTIVE_CONTEXT_LABEL", "AdaptiveDispatchAttack", "AdaptiveTechniqueSelector", - "BANDIT_CONTEXT_LABEL", "ContextExtractor", "TextAdaptive", "global_context", diff --git a/pyrit/scenario/scenarios/adaptive/dispatcher.py b/pyrit/scenario/scenarios/adaptive/dispatcher.py index 7b5d6b5f3..9f6e99c27 100644 --- a/pyrit/scenario/scenarios/adaptive/dispatcher.py +++ b/pyrit/scenario/scenarios/adaptive/dispatcher.py @@ -6,11 +6,11 @@ technique to run for each objective using an ``AdaptiveTechniqueSelector``. This is the execution-side counterpart to the selector. The selector decides -*which arm to pull*; the dispatcher *runs the arm*, records the outcome, and -loops up to ``max_attempts_per_objective`` times. +*which technique to try*; the dispatcher *runs the technique*, records the +outcome, and loops up to ``max_attempts_per_objective`` times. -The dispatcher reads a bandit-context key from -``context.memory_labels[BANDIT_CONTEXT_LABEL]``. The scenario is expected to +The dispatcher reads an adaptive-context key from +``context.memory_labels[ADAPTIVE_CONTEXT_LABEL]``. The scenario is expected to stamp that label per-objective (computed once at atomic-attack construction time via a ``ContextExtractor``). When the label is missing, the global context is used. 
@@ -36,10 +36,10 @@ logger = logging.getLogger(__name__) -BANDIT_CONTEXT_LABEL: str = "_adaptive_context" -"""Memory-label key whose value is the bandit context string for an objective.""" +ADAPTIVE_CONTEXT_LABEL: str = "_adaptive_context" +"""Memory-label key whose value is the adaptive context string for an objective.""" -ADAPTIVE_ARM_LABEL: str = "_adaptive_arm" +ADAPTIVE_TECHNIQUE_LABEL: str = "_adaptive_technique" ADAPTIVE_ATTEMPT_LABEL: str = "_adaptive_attempt" @@ -56,10 +56,10 @@ class AdaptiveDispatchContext(AttackContext[AttackParameters]): class AdaptiveDispatchAttack(AttackStrategy[AdaptiveDispatchContext, AttackResult]): """ Attack that delegates each attempt to one of several inner ``AttackStrategy`` - instances ("arms"), choosing per attempt via an ``AdaptiveTechniqueSelector``. + instances ("techniques"), choosing per attempt via an ``AdaptiveTechniqueSelector``. For each objective the dispatcher loops up to ``max_attempts_per_objective`` - times. On each iteration it asks the selector which arm to try, executes + times. On each iteration it asks the selector which technique to try, executes the inner attack with the objective, records the outcome on the selector, and stops early on success. @@ -71,7 +71,7 @@ def __init__( self, *, objective_target: PromptTarget, - arms: dict[str, AttackStrategy[Any, AttackResult]], + techniques: dict[str, AttackStrategy[Any, AttackResult]], selector: AdaptiveTechniqueSelector, max_attempts_per_objective: int = 3, ) -> None: @@ -80,17 +80,20 @@ def __init__( objective_target (PromptTarget): The target the inner attacks run against. Stored for identifier/logging parity; the dispatcher does not call the target directly. - arms (dict[str, AttackStrategy[Any, AttackResult]]): Mapping from + techniques (dict[str, AttackStrategy[Any, AttackResult]]): Mapping from technique name to a pre-built inner attack. Must be non-empty. - selector (AdaptiveTechniqueSelector): Shared bandit state.
- max_attempts_per_objective (int): Maximum number of arm attempts + These are constructed by the scenario from registered attack + technique factories. + selector (AdaptiveTechniqueSelector): Shared adaptive selection state + that tracks per-technique success rates across objectives. + max_attempts_per_objective (int): Maximum number of technique attempts per objective. Must be >= 1. Defaults to 3. Raises: - ValueError: If ``arms`` is empty or ``max_attempts_per_objective`` < 1. + ValueError: If ``techniques`` is empty or ``max_attempts_per_objective`` < 1. """ - if not arms: - raise ValueError("arms must contain at least one technique") + if not techniques: + raise ValueError("techniques must contain at least one attack technique") if max_attempts_per_objective < 1: raise ValueError(f"max_attempts_per_objective must be >= 1, got {max_attempts_per_objective}") @@ -100,7 +103,7 @@ def __init__( params_type=AttackParameters, logger=logger, ) - self._arms = arms + self._techniques = techniques self._selector = selector self._max_attempts = max_attempts_per_objective @@ -115,26 +118,26 @@ async def _teardown_async(self, *, context: AdaptiveDispatchContext) -> None: pass async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackResult: - bandit_context = context.memory_labels.get(BANDIT_CONTEXT_LABEL, GLOBAL_CONTEXT) - arm_names = list(self._arms.keys()) + adaptive_context = context.memory_labels.get(ADAPTIVE_CONTEXT_LABEL, GLOBAL_CONTEXT) + technique_names = list(self._techniques.keys()) last_result: AttackResult | None = None trail: list[dict[str, str]] = [] for attempt_idx in range(self._max_attempts): - chosen = self._selector.select(context=bandit_context, arms=arm_names) - inner = self._arms[chosen] + chosen = self._selector.select(context=adaptive_context, techniques=technique_names) + inner = self._techniques[chosen] attempt_labels = { **context.memory_labels, - ADAPTIVE_ARM_LABEL: chosen, + ADAPTIVE_TECHNIQUE_LABEL: chosen, 
ADAPTIVE_ATTEMPT_LABEL: str(attempt_idx + 1), } logger.debug( - "AdaptiveDispatchAttack: attempt %d/%d context=%r arm=%r", + "AdaptiveDispatchAttack: attempt %d/%d context=%r technique=%r", attempt_idx + 1, self._max_attempts, - bandit_context, + adaptive_context, chosen, ) @@ -143,7 +146,7 @@ async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackRes memory_labels=attempt_labels, ) success = result.outcome == AttackOutcome.SUCCESS - self._selector.update(context=bandit_context, technique=chosen, success=success) + self._selector.record_outcome(context=adaptive_context, technique=chosen, success=success) trail.append({"technique": chosen, "outcome": result.outcome.value}) last_result = result @@ -156,6 +159,6 @@ async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackRes last_result.metadata = { **last_result.metadata, "adaptive_attempts": trail, - "adaptive_context": bandit_context, + "adaptive_context": adaptive_context, } return last_result diff --git a/pyrit/scenario/scenarios/adaptive/selector.py b/pyrit/scenario/scenarios/adaptive/selector.py index 495c13b7c..5f39b49c0 100644 --- a/pyrit/scenario/scenarios/adaptive/selector.py +++ b/pyrit/scenario/scenarios/adaptive/selector.py @@ -5,8 +5,8 @@ Adaptive technique selection for the ``TextAdaptive`` scenario. This module provides: - - ``AdaptiveTechniqueSelector``: an epsilon-greedy bandit keyed by - ``(context, technique)`` that tracks successes/attempts per arm and + - ``AdaptiveTechniqueSelector``: an epsilon-greedy selector keyed by + ``(context, technique)`` that tracks successes/attempts per technique and picks the next technique to try. - ``ContextExtractor``: a callable alias for deriving a context string from a ``SeedAttackGroup``, plus two ready-made extractors: @@ -28,15 +28,24 @@ ContextExtractor = Callable[["SeedAttackGroup"], str] -"""Maps a ``SeedAttackGroup`` to a bandit context key.""" +"""Maps a ``SeedAttackGroup`` to an adaptive context key (e.g. 
a harm category).""" +# Sentinel context keys used when no per-objective partitioning is desired +# or when a seed group lacks harm category metadata. GLOBAL_CONTEXT: str = "_global" +"""Default context key: all objectives share one selection table.""" UNCATEGORIZED_CONTEXT: str = "_uncategorized" +"""Fallback context for seed groups with no harm category metadata.""" + + +# Context extractors are module-level functions so they can be passed directly +# as the ``context_extractor`` argument to ``TextAdaptive``. They implement the +# ``ContextExtractor`` callable protocol. def global_context(_seed_attack_group: SeedAttackGroup) -> str: - """Return a constant context so all objectives share one bandit table.""" + """Return a constant context so all objectives share one selection table.""" return GLOBAL_CONTEXT @@ -54,28 +63,29 @@ class AdaptiveTechniqueSelector: The selector maintains a table of ``(context, technique) -> (successes, attempts)`` counts. ``select`` returns the next technique to try for a given context, - and ``update`` records the outcome of an attempt. + and ``record_outcome`` records the outcome of an attempt. Selection uses epsilon-greedy with optimistic initialization: - - With probability ``epsilon``, pick uniformly at random from ``arms``. - - Otherwise, pick the arm with the highest estimated success rate. - The estimate is ``(successes + 1) / (attempts + 1)``, so unseen - arms look like 100% success and are explored first via tiebreak. - - When a ``(context, arm)`` cell has fewer than ``pool_threshold`` attempts, - the estimate falls back to the pooled global rate for that arm across all - contexts. This lets per-context bandits benefit from cross-context data + - With probability ``epsilon``, pick uniformly at random from ``techniques``. + - Otherwise, pick the technique with the highest estimated success rate. 
+ The estimate is ``(successes + 1) / (attempts + 1)`` (one optimistic pseudo-success), + so unseen techniques start at 100% and are explored first via tiebreak. + + When a ``(context, technique)`` cell has fewer than ``pool_threshold`` attempts, + the estimate falls back to the pooled global rate for that technique across all + contexts. This lets per-context selectors benefit from cross-context data until they have enough local samples. Set ``pool_threshold=1`` to disable pooling (use the local estimate as soon as any attempt is recorded). Note: This class is not thread/async safe. It assumes sequential calls, - which matches the base ``Scenario._execute_scenario_async`` loop. + which matches the base ``Scenario._execute_scenario_async`` loop + (same pattern as all other scenarios). """ - # Tolerance for tiebreaking in exploitation. Estimates are rational today, - # so equality works, but this guards against future estimators that may - # introduce floating-point drift. + # Tolerance for floating-point comparison when tiebreaking in exploitation. + # Current estimates are exact rationals, but this guards against future + # estimator changes that may introduce floating-point drift. _TIE_TOL: float = 1e-12 def __init__( @@ -88,11 +98,16 @@ def __init__( """ Args: epsilon (float): Exploration probability in [0.0, 1.0]. Defaults to 0.2. - pool_threshold (int): Minimum per-(context, arm) attempts before - the local estimate replaces the pooled-global estimate. Must - be >= 1; set to 1 to disable pooling. Defaults to 3. - rng (random.Random | None): Seedable RNG for deterministic tests. - Defaults to a fresh ``random.Random()``. + pool_threshold (int): Minimum per-(context, technique) attempts before + the local estimate replaces the pooled-global estimate. Until this + threshold is reached, the selector uses the technique's average + across all contexts. Must be >= 1; set to 1 to disable pooling. + Defaults to 3.
+ rng (random.Random | None): A ``random.Random`` instance for + reproducible selection decisions. Using a dedicated RNG (rather + than a bare float) enables seeded determinism across the full + sequence of select calls within a run. Defaults to a fresh + unseeded ``random.Random()``. Raises: ValueError: If ``epsilon`` is outside [0.0, 1.0] or @@ -111,39 +126,39 @@ def __init__( # ``_estimate``'s pooled-backoff branch is O(1). self._global_counts: dict[str, tuple[int, int]] = {} - def select(self, *, context: str, arms: Sequence[str]) -> str: + def select(self, *, context: str, techniques: Sequence[str]) -> str: """ - Pick the next arm to try for ``context``. + Pick the next technique to try for ``context``. Args: context (str): The context key (e.g. ``"_global"`` or a harm category). - arms (Sequence[str]): The candidate technique names. + techniques (Sequence[str]): The candidate technique names. Returns: - str: The chosen arm name. + str: The chosen technique name. Raises: - ValueError: If ``arms`` is empty. + ValueError: If ``techniques`` is empty.
""" - arm_list = list(arms) - if not arm_list: - raise ValueError("arms must contain at least one technique") + technique_list = list(techniques) + if not technique_list: + raise ValueError("techniques must contain at least one entry") if self._rng.random() < self._epsilon: - return self._rng.choice(arm_list) + return self._rng.choice(technique_list) - estimates = {arm: self._estimate(context=context, arm=arm) for arm in arm_list} + estimates = {t: self._estimate(context=context, technique=t) for t in technique_list} best = max(estimates.values()) - winners = [arm for arm, value in estimates.items() if value >= best - self._TIE_TOL] + winners = [t for t, value in estimates.items() if value >= best - self._TIE_TOL] return self._rng.choice(winners) - def update(self, *, context: str, technique: str, success: bool) -> None: + def record_outcome(self, *, context: str, technique: str, success: bool) -> None: """ - Record the outcome of an attempt. + Record the outcome of an attack attempt for a given technique and context. Args: context (str): The context key the decision was made under. - technique (str): The arm that was tried. + technique (str): The technique that was tried. success (bool): Whether the attempt succeeded. """ successes, attempts = self._counts.get((context, technique), (0, 0)) @@ -160,11 +175,13 @@ def update(self, *, context: str, technique: str, success: bool) -> None: def success_rate(self, *, context: str, technique: str) -> float: """ - Return the smoothed success-rate estimate for an arm in a context. + Return the smoothed success-rate estimate for a technique in a context. - This is the same value used internally for exploitation decisions. + The "smoothed" rate is ``(successes + 1) / (attempts + 1)`` — the extra pseudo-success + provides an optimistic prior for unseen techniques (estimate = 1.0) and avoids + division by zero. This is the same value used internally for exploitation decisions.
""" - return self._estimate(context=context, arm=technique) + return self._estimate(context=context, technique=technique) def counts(self, *, context: str, technique: str) -> tuple[int, int]: """Return raw ``(successes, attempts)`` for a ``(context, technique)`` cell.""" @@ -174,15 +191,15 @@ def snapshot(self) -> dict[tuple[str, str], tuple[int, int]]: """Return a shallow copy of the full counts table (for logging/debug).""" return dict(self._counts) - def _estimate(self, *, context: str, arm: str) -> float: + def _estimate(self, *, context: str, technique: str) -> float: """ - Smoothed success-rate estimate for ``(context, arm)``. + Smoothed success-rate estimate for ``(context, technique)``. Below ``pool_threshold`` local attempts, the estimate uses the - pooled-global success rate for the arm across all contexts. + pooled-global success rate for the technique across all contexts. """ - local_s, local_n = self._counts.get((context, arm), (0, 0)) + local_s, local_n = self._counts.get((context, technique), (0, 0)) if local_n >= self._pool_threshold: return (local_s + 1) / (local_n + 1) - global_s, global_n = self._global_counts.get(arm, (0, 0)) + global_s, global_n = self._global_counts.get(technique, (0, 0)) return (global_s + 1) / (global_n + 1) diff --git a/pyrit/scenario/scenarios/adaptive/text_adaptive.py b/pyrit/scenario/scenarios/adaptive/text_adaptive.py index e9be071be..fe33ea8fc 100644 --- a/pyrit/scenario/scenarios/adaptive/text_adaptive.py +++ b/pyrit/scenario/scenarios/adaptive/text_adaptive.py @@ -3,18 +3,18 @@ """ TextAdaptive scenario — picks attack techniques per-objective using an -epsilon-greedy bandit informed by observed per-run success rates. +epsilon-greedy selector informed by observed per-run success rates. Unlike static scenarios (which run every selected technique against every objective), TextAdaptive runs **up to** ``max_attempts_per_objective`` techniques per objective and stops early when one succeeds.
Which technique -to try next is decided by an ``AdaptiveTechniqueSelector`` whose Q-values are +to try next is decided by an ``AdaptiveTechniqueSelector`` whose estimates are updated after every attempt. -The set of available "arms" comes from the selected scenario strategies, so -``--strategies single_turn`` restricts the bandit to single-turn techniques, +The set of available techniques comes from the selected scenario strategies, so +``--strategies single_turn`` restricts the selector to single-turn techniques, etc. The default selector uses a single global context; pass a different -``context_extractor`` (e.g., ``harm_category_context``) to partition Q-values +``context_extractor`` (e.g., ``harm_category_context``) to partition estimates per category. """ @@ -32,7 +32,7 @@ from pyrit.scenario.core.scenario import BaselinePolicy, Scenario from pyrit.scenario.core.scenario_strategy import ScenarioStrategy from pyrit.scenario.scenarios.adaptive.dispatcher import ( - BANDIT_CONTEXT_LABEL, + ADAPTIVE_CONTEXT_LABEL, AdaptiveDispatchAttack, ) from pyrit.scenario.scenarios.adaptive.selector import ( @@ -71,19 +71,19 @@ def _build_text_adaptive_strategy() -> type[ScenarioStrategy]: class TextAdaptive(Scenario): """ Adaptive text-attack scenario that selects techniques per-objective using - an epsilon-greedy bandit over the set of selected strategies. + an epsilon-greedy selector over the set of selected strategies. - The bandit: - - Picks an arm uniformly at random with probability ``epsilon``. - - Otherwise exploits the highest observed success rate. Unseen arms + The selector: + - Picks a technique uniformly at random with probability ``epsilon``. + - Otherwise exploits the highest observed success rate. Unseen techniques have an optimistic prior so the first few objectives effectively round-robin through every available technique. - Pools across contexts when a context has fewer than - ``pool_threshold`` observations for an arm. 
+ ``pool_threshold`` observations for a technique. A baseline ``PromptSendingAttack`` is **not** prepended — every objective runs through the dispatcher, and ``prompt_sending`` participates as one of - the bandit's arms. + the selector's techniques. """ VERSION: int = 1 @@ -141,18 +141,18 @@ def __init__( Args: objective_scorer (TrueFalseScorer | None): Scorer used to judge each response. Defaults to the composite scorer built from the base class. - epsilon (float): Exploration probability for the bandit. Defaults to 0.2. - pool_threshold (int): Minimum per-(context, arm) attempts before the + epsilon (float): Exploration probability for the selector. Defaults to 0.2. + pool_threshold (int): Minimum per-(context, technique) attempts before the local estimate overrides the pooled-global estimate. Set to 1 to disable pooling. Defaults to 3. max_attempts_per_objective (int): Maximum techniques tried per objective before giving up. Defaults to 3. - seed (int | None): RNG seed for deterministic bandit decisions. + seed (int | None): RNG seed for deterministic selection decisions. Defaults to ``None`` (non-deterministic). context_extractor (ContextExtractor): Function mapping a - ``SeedAttackGroup`` to a bandit context key. Defaults to - ``global_context`` (one shared bandit table). Use - ``harm_category_context`` to partition Q-values by harm category. + ``SeedAttackGroup`` to a context key. Defaults to + ``global_context`` (one shared selection table). Use + ``harm_category_context`` to partition estimates by harm category. scenario_result_id (str | None): ID of an existing ``ScenarioResult`` to resume. """ @@ -182,22 +182,23 @@ async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: ``AdaptiveDispatchAttack`` (and therefore a single ``AdaptiveTechniqueSelector``). - This is the bandit's "single working memory shared across objectives" - plumbing: each per-objective ``AtomicAttack`` consults and updates the - same selector via the same dispatcher instance. 
+ Each per-objective ``AtomicAttack`` consults and updates the same + selector via the same dispatcher instance, so learning from one + objective immediately benefits the next. """ if self._objective_target is None: - raise ValueError( - "Scenario not properly initialized. Call await scenario.initialize_async() before running." - ) + raise ValueError("objective_target must be set before creating attacks") - selected_arms = sorted({s.value for s in self._scenario_strategies}) + selected_techniques = sorted({s.value for s in self._scenario_strategies}) factories = self._get_attack_technique_factories() - # Build each arm's inner attack once and reuse across all objectives. + # Build each technique's inner attack once and reuse across all objectives. + # Skip factories that require a seed_technique (e.g. crescendo_simulated) + # since the dispatcher cannot merge technique seeds into the objective's + # seed group at dispatch time. scoring_config = AttackScoringConfig(objective_scorer=cast("TrueFalseScorer", self._objective_scorer)) - arms: dict[str, AttackStrategy] = {} - for technique_name in selected_arms: + techniques: dict[str, AttackStrategy] = {} + for technique_name in selected_techniques: factory = factories.get(technique_name) if factory is None: logger.warning(f"No factory for technique '{technique_name}', skipping.") @@ -206,9 +207,15 @@ async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: objective_target=self._objective_target, attack_scoring_config=scoring_config, ) - arms[technique_name] = technique.attack + if technique.seed_technique is not None: + logger.debug( + "Skipping technique '%s': requires seed_technique which adaptive dispatch cannot handle.", + technique_name, + ) + continue + techniques[technique_name] = technique.attack - if not arms: + if not techniques: raise ValueError( "TextAdaptive: no usable techniques after resolving strategies. Check the --strategies selection." 
) @@ -220,13 +227,10 @@ async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: ) dispatcher = AdaptiveDispatchAttack( objective_target=self._objective_target, - arms=arms, + techniques=techniques, selector=selector, max_attempts_per_objective=self._max_attempts_per_objective, ) - # Stash for tests / debugging; not part of the public API. - self._selector = selector - self._dispatcher = dispatcher seed_groups_by_dataset = self._dataset_config.get_seed_attack_groups() atomic_attacks: list[AtomicAttack] = [] @@ -260,7 +264,7 @@ def _build_atomic_for_seed_group( memory_labels = { **self._memory_labels, - BANDIT_CONTEXT_LABEL: bandit_context, + ADAPTIVE_CONTEXT_LABEL: bandit_context, } return AtomicAttack( atomic_attack_name=atomic_attack_name, diff --git a/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py b/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py index 9e26425cf..87170faa1 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py +++ b/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py @@ -9,9 +9,9 @@ from pyrit.executor.attack.core.attack_parameters import AttackParameters from pyrit.models import AttackOutcome, AttackResult from pyrit.scenario.scenarios.adaptive.dispatcher import ( - ADAPTIVE_ARM_LABEL, ADAPTIVE_ATTEMPT_LABEL, - BANDIT_CONTEXT_LABEL, + ADAPTIVE_CONTEXT_LABEL, + ADAPTIVE_TECHNIQUE_LABEL, AdaptiveDispatchAttack, AdaptiveDispatchContext, ) @@ -53,9 +53,9 @@ def target() -> MagicMock: class TestInit: @pytest.mark.usefixtures("patch_central_database") - def test_init_rejects_empty_arms(self, target, selector): - with pytest.raises(ValueError, match="arms"): - AdaptiveDispatchAttack(objective_target=target, arms={}, selector=selector) + def test_init_rejects_empty_techniques(self, target, selector): + with pytest.raises(ValueError, match="techniques"): + AdaptiveDispatchAttack(objective_target=target, techniques={}, selector=selector) @pytest.mark.parametrize("bad_max", [0, -1]) 
@pytest.mark.usefixtures("patch_central_database") @@ -63,7 +63,7 @@ def test_init_rejects_invalid_max_attempts(self, target, selector, bad_max): with pytest.raises(ValueError, match="max_attempts_per_objective"): AdaptiveDispatchAttack( objective_target=target, - arms={"a": _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS])}, + techniques={"a": _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS])}, selector=selector, max_attempts_per_objective=bad_max, ) @@ -76,7 +76,7 @@ async def test_stops_on_first_success(self, target, selector): b = _make_inner_attack(name="b", outcomes=[AttackOutcome.SUCCESS]) dispatcher = AdaptiveDispatchAttack( objective_target=target, - arms={"a": a, "b": b}, + techniques={"a": a, "b": b}, selector=selector, max_attempts_per_objective=5, ) @@ -92,7 +92,7 @@ async def test_retries_until_max_attempts_on_failure(self, target, selector): b = _make_inner_attack(name="b", outcomes=[AttackOutcome.FAILURE] * 3) dispatcher = AdaptiveDispatchAttack( objective_target=target, - arms={"a": a, "b": b}, + techniques={"a": a, "b": b}, selector=selector, max_attempts_per_objective=3, ) @@ -108,7 +108,7 @@ async def test_updates_selector_on_each_attempt(self, target, selector): b = _make_inner_attack(name="b", outcomes=[AttackOutcome.SUCCESS]) dispatcher = AdaptiveDispatchAttack( objective_target=target, - arms={"a": a, "b": b}, + techniques={"a": a, "b": b}, selector=selector, max_attempts_per_objective=3, ) @@ -124,7 +124,7 @@ async def test_passes_objective_to_inner(self, target, selector): a = _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS]) dispatcher = AdaptiveDispatchAttack( objective_target=target, - arms={"a": a}, + techniques={"a": a}, selector=selector, ) @@ -133,11 +133,11 @@ async def test_passes_objective_to_inner(self, target, selector): kwargs = a.execute_async.call_args.kwargs assert kwargs["objective"] == "my-goal" - async def test_attaches_arm_and_attempt_labels(self, target, selector): + async def 
test_attaches_technique_and_attempt_labels(self, target, selector): a = _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS]) dispatcher = AdaptiveDispatchAttack( objective_target=target, - arms={"a": a}, + techniques={"a": a}, selector=selector, ) @@ -145,24 +145,24 @@ async def test_attaches_arm_and_attempt_labels(self, target, selector): labels = a.execute_async.call_args.kwargs["memory_labels"] assert labels["foo"] == "bar" # caller labels preserved - assert labels[ADAPTIVE_ARM_LABEL] == "a" + assert labels[ADAPTIVE_TECHNIQUE_LABEL] == "a" assert labels[ADAPTIVE_ATTEMPT_LABEL] == "1" - async def test_uses_bandit_context_from_label(self, target, selector): - # Two arms; one has been heavily rewarded under context "violence" only. + async def test_uses_adaptive_context_from_label(self, target, selector): + # Two techniques; one has been heavily rewarded under context "violence" only. a = _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS]) b = _make_inner_attack(name="b", outcomes=[AttackOutcome.SUCCESS]) for _ in range(5): - selector.update(context="violence", technique="b", success=True) + selector.record_outcome(context="violence", technique="b", success=True) for _ in range(5): - selector.update(context="violence", technique="a", success=False) + selector.record_outcome(context="violence", technique="a", success=False) dispatcher = AdaptiveDispatchAttack( objective_target=target, - arms={"a": a, "b": b}, + techniques={"a": a, "b": b}, selector=selector, ) - ctx = _make_context(labels={BANDIT_CONTEXT_LABEL: "violence"}) + ctx = _make_context(labels={ADAPTIVE_CONTEXT_LABEL: "violence"}) await dispatcher._perform_async(context=ctx) # Exploit should have picked "b" first. 
@@ -173,7 +173,7 @@ async def test_falls_back_to_global_context_when_label_missing(self, target, sel a = _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS]) dispatcher = AdaptiveDispatchAttack( objective_target=target, - arms={"a": a}, + techniques={"a": a}, selector=selector, ) await dispatcher._perform_async(context=_make_context(labels={})) @@ -182,12 +182,12 @@ async def test_falls_back_to_global_context_when_label_missing(self, target, sel assert selector.counts(context=GLOBAL_CONTEXT, technique="a") == (1, 1) async def test_metadata_records_adaptive_trail(self, target, selector): - # Arm "a" fails on the first attempt then succeeds; verify the trail + # Technique "a" fails on the first attempt then succeeds; verify the trail # captures both attempts in order. a = _make_inner_attack(name="a", outcomes=[AttackOutcome.FAILURE, AttackOutcome.SUCCESS]) dispatcher = AdaptiveDispatchAttack( objective_target=target, - arms={"a": a}, + techniques={"a": a}, selector=selector, max_attempts_per_objective=3, ) @@ -207,7 +207,7 @@ class TestValidate: def test_validate_rejects_empty_objective(self, target, selector, bad_objective): dispatcher = AdaptiveDispatchAttack( objective_target=target, - arms={"a": _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS])}, + techniques={"a": _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS])}, selector=selector, ) with pytest.raises(ValueError, match="objective"): @@ -216,7 +216,7 @@ def test_validate_rejects_empty_objective(self, target, selector, bad_objective) def test_validate_accepts_normal_objective(self, target, selector): dispatcher = AdaptiveDispatchAttack( objective_target=target, - arms={"a": _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS])}, + techniques={"a": _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS])}, selector=selector, ) # Does not raise. 
diff --git a/tests/unit/scenario/scenarios/adaptive/test_selector.py b/tests/unit/scenario/scenarios/adaptive/test_selector.py index eaddc32ce..4766c16f0 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_selector.py +++ b/tests/unit/scenario/scenarios/adaptive/test_selector.py @@ -14,7 +14,7 @@ harm_category_context, ) -ARMS = ["a", "b", "c", "d"] +TECHNIQUES = ["a", "b", "c", "d"] def _seeded_selector(*, epsilon: float = 0.0, pool_threshold: int = 3, seed: int = 0) -> AdaptiveTechniqueSelector: @@ -43,77 +43,77 @@ def test_init_rejects_pool_threshold_below_one(self): class TestAdaptiveTechniqueSelectorSelect: - def test_select_empty_arms_raises(self): + def test_select_empty_techniques_raises(self): selector = _seeded_selector() - with pytest.raises(ValueError, match="arms"): - selector.select(context=GLOBAL_CONTEXT, arms=[]) + with pytest.raises(ValueError, match="techniques"): + selector.select(context=GLOBAL_CONTEXT, techniques=[]) def test_select_all_unseen_ties_resolved_randomly(self): - # With epsilon=0 and an empty table, every arm has estimate 1/1=1.0, + # With epsilon=0 and an empty table, every technique has estimate 1/1=1.0, # so the result is the seeded random tiebreak. Different seeds should # be able to produce different winners. - winners = {_seeded_selector(seed=s).select(context=GLOBAL_CONTEXT, arms=ARMS) for s in range(50)} + winners = {_seeded_selector(seed=s).select(context=GLOBAL_CONTEXT, techniques=TECHNIQUES) for s in range(50)} assert len(winners) > 1 - assert winners.issubset(set(ARMS)) + assert winners.issubset(set(TECHNIQUES)) def test_select_exploits_clear_winner(self): selector = _seeded_selector(pool_threshold=1) # Give "b" a track record of pure success, others pure failure. 
for _ in range(5): - selector.update(context=GLOBAL_CONTEXT, technique="b", success=True) - for arm in ("a", "c", "d"): + selector.record_outcome(context=GLOBAL_CONTEXT, technique="b", success=True) + for technique in ("a", "c", "d"): for _ in range(5): - selector.update(context=GLOBAL_CONTEXT, technique=arm, success=False) + selector.record_outcome(context=GLOBAL_CONTEXT, technique=technique, success=False) # With epsilon=0, every selection must exploit the winner. for _ in range(20): - assert selector.select(context=GLOBAL_CONTEXT, arms=ARMS) == "b" + assert selector.select(context=GLOBAL_CONTEXT, techniques=TECHNIQUES) == "b" def test_select_epsilon_one_is_pure_random(self): selector = _seeded_selector(epsilon=1.0) # Bias the table heavily toward "a"; with epsilon=1 it must still be ignored. for _ in range(20): - selector.update(context=GLOBAL_CONTEXT, technique="a", success=True) + selector.record_outcome(context=GLOBAL_CONTEXT, technique="a", success=True) - picks = [selector.select(context=GLOBAL_CONTEXT, arms=ARMS) for _ in range(200)] - assert set(picks) == set(ARMS) + picks = [selector.select(context=GLOBAL_CONTEXT, techniques=TECHNIQUES) for _ in range(200)] + assert set(picks) == set(TECHNIQUES) def test_select_epsilon_zero_never_explores(self): selector = _seeded_selector(epsilon=0.0, pool_threshold=1) for _ in range(3): - selector.update(context=GLOBAL_CONTEXT, technique="a", success=True) - # Make the other arms tried-and-failed so they fall below "a"'s estimate; - # unseen arms would otherwise tie at the optimistic 1.0. - for arm in ("b", "c", "d"): - selector.update(context=GLOBAL_CONTEXT, technique=arm, success=False) + selector.record_outcome(context=GLOBAL_CONTEXT, technique="a", success=True) + # Make the other techniques tried-and-failed so they fall below "a"'s estimate; + # unseen techniques would otherwise tie at the optimistic 1.0. 
+ for technique in ("b", "c", "d"): + selector.record_outcome(context=GLOBAL_CONTEXT, technique=technique, success=False) for _ in range(50): - assert selector.select(context=GLOBAL_CONTEXT, arms=ARMS) == "a" + assert selector.select(context=GLOBAL_CONTEXT, techniques=TECHNIQUES) == "a" def test_select_cold_start_round_robins(self): - # Optimistic init + epsilon=0: untried arms tie at 1.0 and beat tried-and-failed - # arms (1/2 = 0.5). So the first failures push each arm to "tried" exactly once - # before any arm gets tried twice. + # Optimistic init + epsilon=0: untried techniques tie at 1.0 and beat tried-and-failed + # techniques (1/2 = 0.5). So the first failures push each technique to "tried" exactly once + # before any technique gets tried twice. selector = _seeded_selector(pool_threshold=1) tried: list[str] = [] - for _ in range(len(ARMS)): - arm = selector.select(context=GLOBAL_CONTEXT, arms=ARMS) - tried.append(arm) - selector.update(context=GLOBAL_CONTEXT, technique=arm, success=False) - assert sorted(tried) == sorted(ARMS) + for _ in range(len(TECHNIQUES)): + technique = selector.select(context=GLOBAL_CONTEXT, techniques=TECHNIQUES) + tried.append(technique) + selector.record_outcome(context=GLOBAL_CONTEXT, technique=technique, success=False) + assert sorted(tried) == sorted(TECHNIQUES) class TestAdaptiveTechniqueSelectorUpdate: - def test_update_accumulates_counts(self): + def test_record_outcome_accumulates_counts(self): selector = _seeded_selector() - selector.update(context="ctx", technique="a", success=True) - selector.update(context="ctx", technique="a", success=False) - selector.update(context="ctx", technique="a", success=True) + selector.record_outcome(context="ctx", technique="a", success=True) + selector.record_outcome(context="ctx", technique="a", success=False) + selector.record_outcome(context="ctx", technique="a", success=True) assert selector.counts(context="ctx", technique="a") == (2, 3) - def 
test_update_separate_contexts_are_independent(self): + def test_record_outcome_separate_contexts_are_independent(self): selector = _seeded_selector() - selector.update(context="x", technique="a", success=True) - selector.update(context="y", technique="a", success=False) + selector.record_outcome(context="x", technique="a", success=True) + selector.record_outcome(context="y", technique="a", success=False) assert selector.counts(context="x", technique="a") == (1, 1) assert selector.counts(context="y", technique="a") == (0, 1) @@ -121,21 +121,21 @@ def test_counts_default_zero_for_unseen(self): selector = _seeded_selector() assert selector.counts(context="missing", technique="missing") == (0, 0) - def test_update_keeps_pooled_global_counts_in_sync(self): + def test_record_outcome_keeps_pooled_global_counts_in_sync(self): # Pooled-global counts back the O(1) pooled-backoff branch in _estimate. - # They must aggregate across contexts for a given arm. + # They must aggregate across contexts for a given technique. selector = _seeded_selector(pool_threshold=5) - selector.update(context="x", technique="a", success=True) - selector.update(context="y", technique="a", success=False) - selector.update(context="z", technique="a", success=True) - selector.update(context="x", technique="b", success=True) + selector.record_outcome(context="x", technique="a", success=True) + selector.record_outcome(context="y", technique="a", success=False) + selector.record_outcome(context="z", technique="a", success=True) + selector.record_outcome(context="x", technique="b", success=True) # Below the local threshold, _estimate must use the pooled-global rate. 
- # arm "a": 2 successes / 3 attempts -> (2+1)/(3+1) = 0.75 + # technique "a": 2 successes / 3 attempts -> (2+1)/(3+1) = 0.75 assert selector.success_rate(context="new_ctx", technique="a") == pytest.approx(0.75) - # arm "b": 1/1 -> (1+1)/(1+1) = 1.0 + # technique "b": 1/1 -> (1+1)/(1+1) = 1.0 assert selector.success_rate(context="new_ctx", technique="b") == pytest.approx(1.0) - # Unseen arm "c" -> (0+1)/(0+1) = 1.0 + # Unseen technique "c" -> (0+1)/(0+1) = 1.0 assert selector.success_rate(context="new_ctx", technique="c") == pytest.approx(1.0) @@ -148,17 +148,17 @@ def test_success_rate_unseen_is_one(self): def test_success_rate_local_when_above_threshold(self): selector = _seeded_selector(pool_threshold=2) for _ in range(3): - selector.update(context="ctx", technique="a", success=True) + selector.record_outcome(context="ctx", technique="a", success=True) # (3 + 1) / (3 + 1) = 1.0 assert selector.success_rate(context="ctx", technique="a") == pytest.approx(1.0) def test_success_rate_pools_when_below_threshold(self): selector = _seeded_selector(pool_threshold=5) # Local cell has only 1 attempt (below threshold). - selector.update(context="ctx", technique="a", success=False) - # Other contexts have plenty of data for arm "a". + selector.record_outcome(context="ctx", technique="a", success=False) + # Other contexts have plenty of data for technique "a". for _ in range(10): - selector.update(context="other", technique="a", success=True) + selector.record_outcome(context="other", technique="a", success=True) # Pooled estimate = (10 + 0 + 1) / (10 + 1 + 1) = 11/12. 
assert selector.success_rate(context="ctx", technique="a") == pytest.approx(11 / 12) diff --git a/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py b/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py index 77176e03e..ff9556d65 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py +++ b/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py @@ -16,12 +16,11 @@ from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario import BaselinePolicy from pyrit.scenario.scenarios.adaptive.dispatcher import ( - BANDIT_CONTEXT_LABEL, + ADAPTIVE_CONTEXT_LABEL, AdaptiveDispatchAttack, ) from pyrit.scenario.scenarios.adaptive.selector import ( GLOBAL_CONTEXT, - AdaptiveTechniqueSelector, harm_category_context, ) from pyrit.scenario.scenarios.adaptive.text_adaptive import TextAdaptive @@ -190,7 +189,6 @@ async def test_all_atomics_share_one_dispatcher(self, mock_objective_target, moc dispatchers = {atomic._attack_technique.attack for atomic in attacks} assert len(dispatchers) == 1 assert isinstance(next(iter(dispatchers)), AdaptiveDispatchAttack) - assert isinstance(scenario._selector, AdaptiveTechniqueSelector) async def test_global_context_label_when_using_global_extractor(self, mock_objective_target, mock_objective_scorer): groups = { @@ -203,7 +201,7 @@ async def test_global_context_label_when_using_global_extractor(self, mock_objec seed_groups=groups, ) for atomic in attacks: - assert atomic._memory_labels[BANDIT_CONTEXT_LABEL] == GLOBAL_CONTEXT + assert atomic._memory_labels[ADAPTIVE_CONTEXT_LABEL] == GLOBAL_CONTEXT async def test_harm_category_extractor_partitions_labels(self, mock_objective_target, mock_objective_scorer): groups = { @@ -217,7 +215,7 @@ async def test_harm_category_extractor_partitions_labels(self, mock_objective_ta seed_groups=groups, context_extractor=harm_category_context, ) - contexts = {atomic._memory_labels[BANDIT_CONTEXT_LABEL] for atomic in attacks} + 
contexts = {atomic._memory_labels[ADAPTIVE_CONTEXT_LABEL] for atomic in attacks} # Each objective gets its own context bucket from harm_category_context. assert contexts == {"violence", "hate", "_uncategorized"} From 3df57871cadd1d37573f9f1e442b40adb2cdc7d5 Mon Sep 17 00:00:00 2001 From: hannahwestra25 Date: Mon, 18 May 2026 18:29:56 -0400 Subject: [PATCH 04/12] pr review --- doc/code/scenarios/3_text_adaptive.ipynb | 44 +++++++++---------- doc/code/scenarios/3_text_adaptive.py | 2 +- doc/myst.yml | 1 + pyrit/scenario/scenarios/adaptive/selector.py | 2 +- .../scenarios/adaptive/text_adaptive.py | 10 ++--- .../scenarios/adaptive/test_selector.py | 5 ++- .../scenarios/adaptive/test_text_adaptive.py | 2 +- 7 files changed, 34 insertions(+), 32 deletions(-) diff --git a/doc/code/scenarios/3_text_adaptive.ipynb b/doc/code/scenarios/3_text_adaptive.ipynb index 474c1872d..3d692cb3a 100644 --- a/doc/code/scenarios/3_text_adaptive.ipynb +++ b/doc/code/scenarios/3_text_adaptive.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "33fbe4e9", + "id": "0", "metadata": {}, "source": [ "# TextAdaptive Scenario\n", @@ -39,7 +39,7 @@ }, { "cell_type": "markdown", - "id": "84bce821", + "id": "1", "metadata": {}, "source": [ "## Setup" @@ -48,16 +48,16 @@ { "cell_type": "code", "execution_count": null, - "id": "62eebc44", + "id": "2", "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "\n", - "from pyrit.scenario.scenarios.adaptive import TextAdaptive, harm_category_context\n", "from pyrit.registry import TargetRegistry\n", "from pyrit.scenario import DatasetConfiguration\n", "from pyrit.scenario.printer.console_printer import ConsoleScenarioResultPrinter\n", + "from pyrit.scenario.scenarios.adaptive import TextAdaptive, harm_category_context\n", "from pyrit.setup import initialize_from_config_async\n", "\n", "await initialize_from_config_async(config_path=Path(\"../../scanner/pyrit_conf.yaml\")) # type: ignore\n", @@ -68,7 +68,7 @@ }, { 
"cell_type": "markdown", - "id": "93a67a83", + "id": "3", "metadata": {}, "source": [ "## Basic Usage\n", @@ -81,7 +81,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f74ce854", + "id": "4", "metadata": {}, "outputs": [], "source": [ @@ -96,7 +96,7 @@ }, { "cell_type": "markdown", - "id": "4bc43876", + "id": "5", "metadata": {}, "source": [ "## Customizing the Selector\n", @@ -112,7 +112,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ac0326e8", + "id": "6", "metadata": {}, "outputs": [], "source": [ @@ -129,7 +129,7 @@ }, { "cell_type": "markdown", - "id": "f0ff26ab", + "id": "7", "metadata": {}, "source": [ "### Max Attempts Per Objective\n", @@ -142,7 +142,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c638b3e5", + "id": "8", "metadata": {}, "outputs": [], "source": [ @@ -158,7 +158,7 @@ }, { "cell_type": "markdown", - "id": "3d9e697f", + "id": "9", "metadata": {}, "source": [ "## Context-Aware Selection\n", @@ -174,7 +174,7 @@ { "cell_type": "code", "execution_count": null, - "id": "954e9c65", + "id": "10", "metadata": {}, "outputs": [], "source": [ @@ -196,7 +196,7 @@ }, { "cell_type": "markdown", - "id": "c6319923", + "id": "11", "metadata": {}, "source": [ "The `pool_threshold` parameter controls how many local observations are needed before\n", @@ -207,7 +207,7 @@ }, { "cell_type": "markdown", - "id": "df56a4af", + "id": "12", "metadata": {}, "source": [ "## Strategy Selection\n", @@ -220,7 +220,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bbd3b91f", + "id": "13", "metadata": {}, "outputs": [], "source": [ @@ -234,7 +234,7 @@ }, { "cell_type": "markdown", - "id": "004bea90", + "id": "14", "metadata": {}, "source": [ "To limit the selector to only single-turn techniques:" @@ -243,7 +243,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f081a04d", + "id": "15", "metadata": {}, "outputs": [], "source": [ @@ -260,7 +260,7 @@ }, { "cell_type": "markdown", - "id": "bd48c512", + "id": "16", 
"metadata": {}, "source": [ "## Deterministic Runs\n", @@ -271,7 +271,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c022bc5c", + "id": "17", "metadata": {}, "outputs": [], "source": [ @@ -287,7 +287,7 @@ }, { "cell_type": "markdown", - "id": "0878f42e", + "id": "18", "metadata": {}, "source": [ "## Custom Scorer\n", @@ -299,7 +299,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e251342b", + "id": "19", "metadata": {}, "outputs": [], "source": [ @@ -321,7 +321,7 @@ }, { "cell_type": "markdown", - "id": "ba2bda21", + "id": "20", "metadata": {}, "source": [ "## Notes\n", diff --git a/doc/code/scenarios/3_text_adaptive.py b/doc/code/scenarios/3_text_adaptive.py index a5774753a..9a8cbfaa4 100644 --- a/doc/code/scenarios/3_text_adaptive.py +++ b/doc/code/scenarios/3_text_adaptive.py @@ -46,10 +46,10 @@ # %% from pathlib import Path -from pyrit.scenario.scenarios.adaptive import TextAdaptive, harm_category_context from pyrit.registry import TargetRegistry from pyrit.scenario import DatasetConfiguration from pyrit.scenario.printer.console_printer import ConsoleScenarioResultPrinter +from pyrit.scenario.scenarios.adaptive import TextAdaptive, harm_category_context from pyrit.setup import initialize_from_config_async await initialize_from_config_async(config_path=Path("../../scanner/pyrit_conf.yaml")) # type: ignore diff --git a/doc/myst.yml b/doc/myst.yml index f703d6c8c..5b4818377 100644 --- a/doc/myst.yml +++ b/doc/myst.yml @@ -168,6 +168,7 @@ project: children: - file: code/scenarios/1_common_scenario_parameters.ipynb - file: code/scenarios/2_custom_scenario_parameters.ipynb + - file: code/scenarios/3_adaptive_scenarios.ipynb - file: code/registry/0_registry.md children: - file: code/registry/1_class_registry.ipynb diff --git a/pyrit/scenario/scenarios/adaptive/selector.py b/pyrit/scenario/scenarios/adaptive/selector.py index 5f39b49c0..c5c9bc643 100644 --- a/pyrit/scenario/scenarios/adaptive/selector.py +++ 
b/pyrit/scenario/scenarios/adaptive/selector.py @@ -54,7 +54,7 @@ def harm_category_context(seed_attack_group: SeedAttackGroup) -> str: categories = seed_attack_group.harm_categories if not categories: return UNCATEGORIZED_CONTEXT - return categories[0] + return sorted(categories)[0] class AdaptiveTechniqueSelector: diff --git a/pyrit/scenario/scenarios/adaptive/text_adaptive.py b/pyrit/scenario/scenarios/adaptive/text_adaptive.py index fe33ea8fc..88885fef9 100644 --- a/pyrit/scenario/scenarios/adaptive/text_adaptive.py +++ b/pyrit/scenario/scenarios/adaptive/text_adaptive.py @@ -23,7 +23,7 @@ import logging import random import uuid -from typing import TYPE_CHECKING, ClassVar, cast +from typing import TYPE_CHECKING, Any, ClassVar, cast from pyrit.common import apply_defaults from pyrit.executor.attack import AttackScoringConfig @@ -43,7 +43,7 @@ if TYPE_CHECKING: from pyrit.executor.attack.core.attack_strategy import AttackStrategy - from pyrit.models import SeedAttackGroup + from pyrit.models import AttackResult, SeedAttackGroup from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.score import TrueFalseScorer @@ -197,7 +197,7 @@ async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: # since the dispatcher cannot merge technique seeds into the objective's # seed group at dispatch time. 
scoring_config = AttackScoringConfig(objective_scorer=cast("TrueFalseScorer", self._objective_scorer)) - techniques: dict[str, AttackStrategy] = {} + techniques: dict[str, AttackStrategy[Any, AttackResult]] = {} for technique_name in selected_techniques: factory = factories.get(technique_name) if factory is None: @@ -256,7 +256,7 @@ def _build_atomic_for_seed_group( from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique - bandit_context = self._context_extractor(seed_group) + adaptive_context = self._context_extractor(seed_group) # Use the objective's id when available so resume keys are stable across # runs that re-fetch the same seed groups; fall back to a random uuid. objective_id = seed_group.objective.id if seed_group.objective.id else uuid.uuid4() @@ -264,7 +264,7 @@ def _build_atomic_for_seed_group( memory_labels = { **self._memory_labels, - ADAPTIVE_CONTEXT_LABEL: bandit_context, + ADAPTIVE_CONTEXT_LABEL: adaptive_context, } return AtomicAttack( atomic_attack_name=atomic_attack_name, diff --git a/tests/unit/scenario/scenarios/adaptive/test_selector.py b/tests/unit/scenario/scenarios/adaptive/test_selector.py index 4766c16f0..ab6aae03e 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_selector.py +++ b/tests/unit/scenario/scenarios/adaptive/test_selector.py @@ -168,10 +168,11 @@ def test_global_context_is_constant(self): sg = MagicMock() assert global_context(sg) == GLOBAL_CONTEXT - def test_harm_category_context_uses_first_category(self): + def test_harm_category_context_uses_sorted_first_category(self): sg = MagicMock() sg.harm_categories = ["violence", "hate"] - assert harm_category_context(sg) == "violence" + # sorted() ensures deterministic selection regardless of set iteration order + assert harm_category_context(sg) == "hate" def test_harm_category_context_falls_back_when_empty(self): sg = MagicMock() diff --git a/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py 
b/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py index ff9556d65..c67abc0fc 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py +++ b/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py @@ -117,7 +117,7 @@ def test_get_default_strategy(self): assert strat is not None @patch("pyrit.scenario.core.scenario.Scenario._get_default_objective_scorer") - def test_init_stores_bandit_params(self, mock_get_scorer, mock_objective_scorer): + def test_init_stores_adaptive_params(self, mock_get_scorer, mock_objective_scorer): mock_get_scorer.return_value = mock_objective_scorer scenario = TextAdaptive( epsilon=0.4, From b794db05edd0c4e2b30fabc7cd93fa342e8237ee Mon Sep 17 00:00:00 2001 From: hannahwestra25 Date: Tue, 19 May 2026 12:59:08 -0400 Subject: [PATCH 05/12] generalize and clean up comments & notebooks --- doc/code/scenarios/3_adaptive_scenarios.ipynb | 260 +++++++++++++ doc/code/scenarios/3_adaptive_scenarios.py | 164 +++++++++ doc/code/scenarios/3_text_adaptive.ipynb | 345 ------------------ doc/code/scenarios/3_text_adaptive.py | 220 ----------- pyrit/scenario/scenarios/adaptive/__init__.py | 2 + .../scenarios/adaptive/adaptive_scenario.py | 278 ++++++++++++++ .../scenario/scenarios/adaptive/dispatcher.py | 93 +++-- pyrit/scenario/scenarios/adaptive/selector.py | 171 ++++----- .../scenarios/adaptive/text_adaptive.py | 212 ++--------- .../scenarios/adaptive/test_dispatcher.py | 33 ++ .../scenarios/adaptive/test_selector.py | 44 ++- .../scenarios/adaptive/test_text_adaptive.py | 119 ++++++ 12 files changed, 1039 insertions(+), 902 deletions(-) create mode 100644 doc/code/scenarios/3_adaptive_scenarios.ipynb create mode 100644 doc/code/scenarios/3_adaptive_scenarios.py delete mode 100644 doc/code/scenarios/3_text_adaptive.ipynb delete mode 100644 doc/code/scenarios/3_text_adaptive.py create mode 100644 pyrit/scenario/scenarios/adaptive/adaptive_scenario.py diff --git a/doc/code/scenarios/3_adaptive_scenarios.ipynb 
b/doc/code/scenarios/3_adaptive_scenarios.ipynb new file mode 100644 index 000000000..93938c1d6 --- /dev/null +++ b/doc/code/scenarios/3_adaptive_scenarios.ipynb @@ -0,0 +1,260 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": {}, + "source": [ + "# Adaptive Scenarios\n", + "\n", + "An **adaptive scenario** doesn't run every attack technique against every objective.\n", + "Instead, it picks which technique to try next per-objective, learns from what worked,\n", + "and stops as soon as one technique succeeds. This concentrates spend on techniques\n", + "that actually work on your target.\n", + "\n", + "## How it works (high level)\n", + "\n", + "For each objective, the scenario tries up to `max_attempts_per_objective` techniques:\n", + "\n", + "- With probability `epsilon`, it **explores** — picks a random technique.\n", + "- Otherwise it **exploits** — picks the technique with the highest observed success\n", + " rate so far.\n", + "- It records the outcome and stops early on success.\n", + "\n", + "Unseen techniques are tried first, so the first few objectives effectively round-robin\n", + "through every technique before the scenario settles on the best performers.\n", + "\n", + "## Adaptive vs. static scenarios\n", + "\n", + "| Feature | Static scenarios | Adaptive scenarios |\n", + "|---------------------|-----------------------------------|------------------------------------|\n", + "| Technique selection | Run every selected technique | Pick per-objective from outcomes |\n", + "| Early stopping | No | Yes — stops on first success |\n", + "| Cost | O(techniques × objectives) | O(max_attempts × objectives) |\n", + "\n", + "`AdaptiveScenario` is the modality-agnostic base class.\n", + "[`TextAdaptive`](../../../pyrit/scenario/scenarios/adaptive/text_adaptive.py) is the\n", + "text subclass used in the examples below." 
+ ] + }, + { + "cell_type": "markdown", + "id": "1", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "from pyrit.registry import TargetRegistry\n", + "from pyrit.scenario import DatasetConfiguration\n", + "from pyrit.scenario.printer.console_printer import ConsoleScenarioResultPrinter\n", + "from pyrit.scenario.scenarios.adaptive import TextAdaptive, harm_category_context\n", + "from pyrit.setup import initialize_from_config_async\n", + "\n", + "await initialize_from_config_async(config_path=Path(\"../../scanner/pyrit_conf.yaml\")) # type: ignore\n", + "\n", + "objective_target = TargetRegistry.get_registry_singleton().get_instance_by_name(\"openai_chat\")\n", + "printer = ConsoleScenarioResultPrinter()" + ] + }, + { + "cell_type": "markdown", + "id": "3", + "metadata": {}, + "source": [ + "## Basic usage\n", + "\n", + "Defaults: `epsilon=0.2`, `max_attempts_per_objective=3`, the subclass's default datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": {}, + "outputs": [], + "source": [ + "scenario = TextAdaptive()\n", + "\n", + "await scenario.initialize_async( # type: ignore\n", + " objective_target=objective_target,\n", + ")\n", + "result = await scenario.run_async() # type: ignore\n", + "await printer.print_summary_async(result) # type: ignore" + ] + }, + { + "cell_type": "markdown", + "id": "5", + "metadata": {}, + "source": [ + "## Tuning exploration (`epsilon`)\n", + "\n", + "- `epsilon=0.0` — pure exploitation (always pick the best-known technique).\n", + "- `epsilon=1.0` — pure exploration (random every time).\n", + "- `epsilon=0.2` (default) — 20% exploration." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6", + "metadata": {}, + "outputs": [], + "source": [ + "explorative_scenario = TextAdaptive(epsilon=0.5)\n", + "\n", + "await explorative_scenario.initialize_async( # type: ignore\n", + " objective_target=objective_target,\n", + " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=4),\n", + ")\n", + "explorative_result = await explorative_scenario.run_async() # type: ignore\n", + "await printer.print_summary_async(explorative_result) # type: ignore" + ] + }, + { + "cell_type": "markdown", + "id": "7", + "metadata": {}, + "source": [ + "## Attempts per objective\n", + "\n", + "`max_attempts_per_objective` caps how many techniques are tried per objective before\n", + "moving on. Higher = more chances to succeed, more API calls." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8", + "metadata": {}, + "outputs": [], + "source": [ + "persistent_scenario = TextAdaptive(max_attempts_per_objective=5)\n", + "\n", + "await persistent_scenario.initialize_async( # type: ignore\n", + " objective_target=objective_target,\n", + " dataset_config=DatasetConfiguration(dataset_names=[\"airt_violence\"], max_dataset_size=4),\n", + ")\n", + "persistent_result = await persistent_scenario.run_async() # type: ignore\n", + "await printer.print_summary_async(persistent_result) # type: ignore" + ] + }, + { + "cell_type": "markdown", + "id": "9", + "metadata": {}, + "source": [ + "## Learning per harm category\n", + "\n", + "By default, the scenario keeps one global success-rate table — what works on hate\n", + "objectives boosts the same technique on violence objectives. 
Pass `harm_category_context`\n", + "to learn each category independently:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10", + "metadata": {}, + "outputs": [], + "source": [ + "contextual_scenario = TextAdaptive(context_extractor=harm_category_context)\n", + "\n", + "await contextual_scenario.initialize_async( # type: ignore\n", + " objective_target=objective_target,\n", + " dataset_config=DatasetConfiguration(\n", + " dataset_names=[\"airt_hate\", \"airt_violence\"],\n", + " max_dataset_size=4,\n", + " ),\n", + ")\n", + "contextual_result = await contextual_scenario.run_async() # type: ignore\n", + "await printer.print_summary_async(contextual_result) # type: ignore" + ] + }, + { + "cell_type": "markdown", + "id": "11", + "metadata": {}, + "source": [ + "## Restricting which techniques participate\n", + "\n", + "Use `scenario_strategies` to limit which techniques the scenario can pick from." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12", + "metadata": {}, + "outputs": [], + "source": [ + "strategy_class = TextAdaptive.get_strategy_class()\n", + "\n", + "single_turn_scenario = TextAdaptive()\n", + "\n", + "await single_turn_scenario.initialize_async( # type: ignore\n", + " objective_target=objective_target,\n", + " scenario_strategies=[strategy_class(\"single_turn\")],\n", + " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=4),\n", + ")\n", + "single_turn_result = await single_turn_scenario.run_async() # type: ignore\n", + "await printer.print_summary_async(single_turn_result) # type: ignore" + ] + }, + { + "cell_type": "markdown", + "id": "13", + "metadata": {}, + "source": [ + "## Reproducible runs\n", + "\n", + "Pass `seed` to make every selection decision deterministic." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14", + "metadata": {}, + "outputs": [], + "source": [ + "deterministic_scenario = TextAdaptive(seed=42, epsilon=0.3)\n", + "\n", + "await deterministic_scenario.initialize_async( # type: ignore\n", + " objective_target=objective_target,\n", + " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=2),\n", + ")\n", + "deterministic_result = await deterministic_scenario.run_async() # type: ignore\n", + "await printer.print_summary_async(deterministic_result) # type: ignore" + ] + }, + { + "cell_type": "markdown", + "id": "15", + "metadata": {}, + "source": [ + "## Resuming a run\n", + "\n", + "Adaptive scenarios are resumable — pass `scenario_result_id=...` to `initialize_async`\n", + "and the run picks up where it left off, with prior outcomes replayed into the selector." + ] + } + ], + "metadata": { + "jupytext": { + "main_language": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/code/scenarios/3_adaptive_scenarios.py b/doc/code/scenarios/3_adaptive_scenarios.py new file mode 100644 index 000000000..27a4cffbd --- /dev/null +++ b/doc/code/scenarios/3_adaptive_scenarios.py @@ -0,0 +1,164 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.18.1 +# --- + +# %% [markdown] +# # Adaptive Scenarios +# +# An **adaptive scenario** doesn't run every attack technique against every objective. +# Instead, it picks which technique to try next per-objective, learns from what worked, +# and stops as soon as one technique succeeds. This concentrates spend on techniques +# that actually work on your target. +# +# ## How it works (high level) +# +# For each objective, the scenario tries up to `max_attempts_per_objective` techniques: +# +# - With probability `epsilon`, it **explores** — picks a random technique. 
+# - Otherwise it **exploits** — picks the technique with the highest observed success +# rate so far. +# - It records the outcome and stops early on success. +# +# Unseen techniques are tried first, so the first few objectives effectively round-robin +# through every technique before the scenario settles on the best performers. +# +# ## Adaptive vs. static scenarios +# +# | Feature | Static scenarios | Adaptive scenarios | +# |---------------------|-----------------------------------|------------------------------------| +# | Technique selection | Run every selected technique | Pick per-objective from outcomes | +# | Early stopping | No | Yes — stops on first success | +# | Cost | O(techniques × objectives) | O(max_attempts × objectives) | +# +# `AdaptiveScenario` is the modality-agnostic base class. +# [`TextAdaptive`](../../../pyrit/scenario/scenarios/adaptive/text_adaptive.py) is the +# text subclass used in the examples below. + +# %% [markdown] +# ## Setup + +# %% +from pathlib import Path + +from pyrit.registry import TargetRegistry +from pyrit.scenario import DatasetConfiguration +from pyrit.scenario.printer.console_printer import ConsoleScenarioResultPrinter +from pyrit.scenario.scenarios.adaptive import TextAdaptive, harm_category_context +from pyrit.setup import initialize_from_config_async + +await initialize_from_config_async(config_path=Path("../../scanner/pyrit_conf.yaml")) # type: ignore + +objective_target = TargetRegistry.get_registry_singleton().get_instance_by_name("openai_chat") +printer = ConsoleScenarioResultPrinter() + +# %% [markdown] +# ## Basic usage +# +# Defaults: `epsilon=0.2`, `max_attempts_per_objective=3`, the subclass's default datasets. 
+ +# %% +scenario = TextAdaptive() + +await scenario.initialize_async( # type: ignore + objective_target=objective_target, +) +result = await scenario.run_async() # type: ignore +await printer.print_summary_async(result) # type: ignore + +# %% [markdown] +# ## Tuning exploration (`epsilon`) +# +# - `epsilon=0.0` — pure exploitation (always pick the best-known technique). +# - `epsilon=1.0` — pure exploration (random every time). +# - `epsilon=0.2` (default) — 20% exploration. + +# %% +explorative_scenario = TextAdaptive(epsilon=0.5) + +await explorative_scenario.initialize_async( # type: ignore + objective_target=objective_target, + dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=4), +) +explorative_result = await explorative_scenario.run_async() # type: ignore +await printer.print_summary_async(explorative_result) # type: ignore + +# %% [markdown] +# ## Attempts per objective +# +# `max_attempts_per_objective` caps how many techniques are tried per objective before +# moving on. Higher = more chances to succeed, more API calls. + +# %% +persistent_scenario = TextAdaptive(max_attempts_per_objective=5) + +await persistent_scenario.initialize_async( # type: ignore + objective_target=objective_target, + dataset_config=DatasetConfiguration(dataset_names=["airt_violence"], max_dataset_size=4), +) +persistent_result = await persistent_scenario.run_async() # type: ignore +await printer.print_summary_async(persistent_result) # type: ignore + +# %% [markdown] +# ## Learning per harm category +# +# By default, the scenario keeps one global success-rate table — what works on hate +# objectives boosts the same technique on violence objectives. 
Pass `harm_category_context` +# to learn each category independently: + +# %% +contextual_scenario = TextAdaptive(context_extractor=harm_category_context) + +await contextual_scenario.initialize_async( # type: ignore + objective_target=objective_target, + dataset_config=DatasetConfiguration( + dataset_names=["airt_hate", "airt_violence"], + max_dataset_size=4, + ), +) +contextual_result = await contextual_scenario.run_async() # type: ignore +await printer.print_summary_async(contextual_result) # type: ignore + +# %% [markdown] +# ## Restricting which techniques participate +# +# Use `scenario_strategies` to limit which techniques the scenario can pick from. + +# %% +strategy_class = TextAdaptive.get_strategy_class() + +single_turn_scenario = TextAdaptive() + +await single_turn_scenario.initialize_async( # type: ignore + objective_target=objective_target, + scenario_strategies=[strategy_class("single_turn")], + dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=4), +) +single_turn_result = await single_turn_scenario.run_async() # type: ignore +await printer.print_summary_async(single_turn_result) # type: ignore + +# %% [markdown] +# ## Reproducible runs +# +# Pass `seed` to make every selection decision deterministic. + +# %% +deterministic_scenario = TextAdaptive(seed=42, epsilon=0.3) + +await deterministic_scenario.initialize_async( # type: ignore + objective_target=objective_target, + dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=2), +) +deterministic_result = await deterministic_scenario.run_async() # type: ignore +await printer.print_summary_async(deterministic_result) # type: ignore + +# %% [markdown] +# ## Resuming a run +# +# Adaptive scenarios are resumable — pass `scenario_result_id=...` to `initialize_async` +# and the run picks up where it left off, with prior outcomes replayed into the selector. 
diff --git a/doc/code/scenarios/3_text_adaptive.ipynb b/doc/code/scenarios/3_text_adaptive.ipynb deleted file mode 100644 index 3d692cb3a..000000000 --- a/doc/code/scenarios/3_text_adaptive.ipynb +++ /dev/null @@ -1,345 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0", - "metadata": {}, - "source": [ - "# TextAdaptive Scenario\n", - "\n", - "The `TextAdaptive` scenario uses an **epsilon-greedy selector** to intelligently choose\n", - "which attack technique to try for each objective. Unlike static scenarios that run every\n", - "selected technique against every objective, `TextAdaptive` adapts its strategy selection\n", - "based on observed success rates — spending more attempts on techniques that work and\n", - "exploring new ones with a configurable probability.\n", - "\n", - "## How It Works\n", - "\n", - "For each objective (prompt), the selector:\n", - "\n", - "1. **Explores** with probability `epsilon` — picks a technique uniformly at random.\n", - "2. **Exploits** otherwise — picks the technique with the highest observed success rate.\n", - "3. **Stops early** when a technique succeeds, avoiding wasted attempts.\n", - "4. 
Tries **up to** `max_attempts_per_objective` techniques before moving on.\n", - "\n", - "Unseen techniques start with an optimistic prior (100% success estimate), so the first\n", - "few objectives effectively round-robin through every available technique before the\n", - "selector converges on the best performers.\n", - "\n", - "## Key Differences from Static Scenarios\n", - "\n", - "| Feature | Static Scenarios | TextAdaptive |\n", - "|---------|-----------------|--------------|\n", - "| Technique selection | Run all selected techniques | Selector picks per-objective |\n", - "| Early stopping | No | Yes — stops on first success |\n", - "| Learning | None | Updates success rates after each attempt |\n", - "| Baseline | Prepended automatically | Forbidden — `prompt_sending` is a technique |\n", - "| Efficiency | O(techniques × objectives) | O(max_attempts × objectives) |" - ] - }, - { - "cell_type": "markdown", - "id": "1", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "\n", - "from pyrit.registry import TargetRegistry\n", - "from pyrit.scenario import DatasetConfiguration\n", - "from pyrit.scenario.printer.console_printer import ConsoleScenarioResultPrinter\n", - "from pyrit.scenario.scenarios.adaptive import TextAdaptive, harm_category_context\n", - "from pyrit.setup import initialize_from_config_async\n", - "\n", - "await initialize_from_config_async(config_path=Path(\"../../scanner/pyrit_conf.yaml\")) # type: ignore\n", - "\n", - "objective_target = TargetRegistry.get_registry_singleton().get_instance_by_name(\"openai_chat\")\n", - "printer = ConsoleScenarioResultPrinter()" - ] - }, - { - "cell_type": "markdown", - "id": "3", - "metadata": {}, - "source": [ - "## Basic Usage\n", - "\n", - "The simplest way to run `TextAdaptive` uses all defaults: the selector explores with 20%\n", - "probability, tries up 
to 3 techniques per objective, and uses the default dataset\n", - "(AIRT harm categories)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4", - "metadata": {}, - "outputs": [], - "source": [ - "scenario = TextAdaptive()\n", - "\n", - "await scenario.initialize_async( # type: ignore\n", - " objective_target=objective_target,\n", - ")\n", - "result = await scenario.run_async() # type: ignore\n", - "await printer.print_summary_async(result) # type: ignore" - ] - }, - { - "cell_type": "markdown", - "id": "5", - "metadata": {}, - "source": [ - "## Customizing the Selector\n", - "\n", - "### Epsilon (Exploration Rate)\n", - "\n", - "`epsilon` controls how often the selector explores vs. exploits:\n", - "- `epsilon=0.0` — pure exploitation (always pick the best-known technique)\n", - "- `epsilon=1.0` — pure exploration (random selection every time)\n", - "- `epsilon=0.2` (default) — 20% random exploration, 80% exploitation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6", - "metadata": {}, - "outputs": [], - "source": [ - "# More explorative selector — useful when you want broader technique coverage\n", - "explorative_scenario = TextAdaptive(epsilon=0.5)\n", - "\n", - "await explorative_scenario.initialize_async( # type: ignore\n", - " objective_target=objective_target,\n", - " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=4),\n", - ")\n", - "explorative_result = await explorative_scenario.run_async() # type: ignore\n", - "await printer.print_summary_async(explorative_result) # type: ignore" - ] - }, - { - "cell_type": "markdown", - "id": "7", - "metadata": {}, - "source": [ - "### Max Attempts Per Objective\n", - "\n", - "`max_attempts_per_objective` caps how many techniques the selector tries before giving\n", - "up on an objective. Setting this higher gives more chances to succeed but costs more\n", - "API calls." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8", - "metadata": {}, - "outputs": [], - "source": [ - "persistent_scenario = TextAdaptive(max_attempts_per_objective=5)\n", - "\n", - "await persistent_scenario.initialize_async( # type: ignore\n", - " objective_target=objective_target,\n", - " dataset_config=DatasetConfiguration(dataset_names=[\"airt_violence\"], max_dataset_size=4),\n", - ")\n", - "persistent_result = await persistent_scenario.run_async() # type: ignore\n", - "await printer.print_summary_async(persistent_result) # type: ignore" - ] - }, - { - "cell_type": "markdown", - "id": "9", - "metadata": {}, - "source": [ - "## Context-Aware Selection\n", - "\n", - "By default, the selector shares one global table across all objectives. This means\n", - "a technique that works well on hate-speech objectives also gets boosted for\n", - "violence objectives.\n", - "\n", - "To partition the selector by harm category (so each category learns independently),\n", - "pass `harm_category_context` as the `context_extractor`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10", - "metadata": {}, - "outputs": [], - "source": [ - "contextual_scenario = TextAdaptive(\n", - " context_extractor=harm_category_context,\n", - " pool_threshold=2,\n", - ")\n", - "\n", - "await contextual_scenario.initialize_async( # type: ignore\n", - " objective_target=objective_target,\n", - " dataset_config=DatasetConfiguration(\n", - " dataset_names=[\"airt_hate\", \"airt_violence\"],\n", - " max_dataset_size=4,\n", - " ),\n", - ")\n", - "contextual_result = await contextual_scenario.run_async() # type: ignore\n", - "await printer.print_summary_async(contextual_result) # type: ignore" - ] - }, - { - "cell_type": "markdown", - "id": "11", - "metadata": {}, - "source": [ - "The `pool_threshold` parameter controls how many local observations are needed before\n", - "the per-category estimate overrides the pooled-global estimate. 
With\n", - "`pool_threshold=2`, the selector uses the global average until it has seen at least 2\n", - "results for a specific (category, technique) pair." - ] - }, - { - "cell_type": "markdown", - "id": "12", - "metadata": {}, - "source": [ - "## Strategy Selection\n", - "\n", - "`TextAdaptive` builds its strategy enum dynamically from the scenario-techniques\n", - "catalog. You can restrict which techniques participate using the\n", - "`scenario_strategies` parameter:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": {}, - "outputs": [], - "source": [ - "strategy_class = TextAdaptive.get_strategy_class()\n", - "\n", - "# See all available strategies\n", - "print(\"Available strategies:\")\n", - "for member in strategy_class:\n", - " print(f\" {member.value}\")" - ] - }, - { - "cell_type": "markdown", - "id": "14", - "metadata": {}, - "source": [ - "To limit the selector to only single-turn techniques:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15", - "metadata": {}, - "outputs": [], - "source": [ - "single_turn_scenario = TextAdaptive()\n", - "\n", - "await single_turn_scenario.initialize_async( # type: ignore\n", - " objective_target=objective_target,\n", - " scenario_strategies=[strategy_class(\"single_turn\")],\n", - " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=4),\n", - ")\n", - "single_turn_result = await single_turn_scenario.run_async() # type: ignore\n", - "await printer.print_summary_async(single_turn_result) # type: ignore" - ] - }, - { - "cell_type": "markdown", - "id": "16", - "metadata": {}, - "source": [ - "## Deterministic Runs\n", - "\n", - "For reproducibility, pass a `seed` to make the selector's random decisions deterministic:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17", - "metadata": {}, - "outputs": [], - "source": [ - "deterministic_scenario = TextAdaptive(seed=42, epsilon=0.3)\n", - "\n", - "await 
deterministic_scenario.initialize_async( # type: ignore\n", - " objective_target=objective_target,\n", - " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=2),\n", - ")\n", - "deterministic_result = await deterministic_scenario.run_async() # type: ignore\n", - "await printer.print_summary_async(deterministic_result) # type: ignore" - ] - }, - { - "cell_type": "markdown", - "id": "18", - "metadata": {}, - "source": [ - "## Custom Scorer\n", - "\n", - "By default, `TextAdaptive` uses the standard composite scorer. You can override it\n", - "with any `TrueFalseScorer`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19", - "metadata": {}, - "outputs": [], - "source": [ - "from pyrit.prompt_target import OpenAIChatTarget\n", - "from pyrit.score import SelfAskRefusalScorer, TrueFalseInverterScorer\n", - "\n", - "refusal_scorer = SelfAskRefusalScorer(chat_target=OpenAIChatTarget())\n", - "inverted_scorer = TrueFalseInverterScorer(scorer=refusal_scorer)\n", - "\n", - "custom_scorer_scenario = TextAdaptive(objective_scorer=inverted_scorer)\n", - "\n", - "await custom_scorer_scenario.initialize_async( # type: ignore\n", - " objective_target=objective_target,\n", - " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=2),\n", - ")\n", - "custom_result = await custom_scorer_scenario.run_async() # type: ignore\n", - "await printer.print_summary_async(custom_result) # type: ignore" - ] - }, - { - "cell_type": "markdown", - "id": "20", - "metadata": {}, - "source": [ - "## Notes\n", - "\n", - "- **No baseline**: `TextAdaptive` has `BASELINE_POLICY = Forbidden`. 
The `prompt_sending`\n", - " technique participates as one of the selector's techniques, so a separate baseline is redundant.\n", - "- **Resumability**: Each atomic attack is keyed by `adaptive_{dataset}_{objective_id}`, so\n", - " re-running a scenario picks up where it left off.\n", - "- **Shared selector**: All objectives in a run share the same `AdaptiveTechniqueSelector`\n", - " instance, so learning from one objective immediately benefits the next." - ] - } - ], - "metadata": { - "jupytext": { - "main_language": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/doc/code/scenarios/3_text_adaptive.py b/doc/code/scenarios/3_text_adaptive.py deleted file mode 100644 index 9a8cbfaa4..000000000 --- a/doc/code/scenarios/3_text_adaptive.py +++ /dev/null @@ -1,220 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.18.1 -# --- - -# %% [markdown] -# # TextAdaptive Scenario -# -# The `TextAdaptive` scenario uses an **epsilon-greedy selector** to intelligently choose -# which attack technique to try for each objective. Unlike static scenarios that run every -# selected technique against every objective, `TextAdaptive` adapts its strategy selection -# based on observed success rates — spending more attempts on techniques that work and -# exploring new ones with a configurable probability. -# -# ## How It Works -# -# For each objective (prompt), the selector: -# -# 1. **Explores** with probability `epsilon` — picks a technique uniformly at random. -# 2. **Exploits** otherwise — picks the technique with the highest observed success rate. -# 3. **Stops early** when a technique succeeds, avoiding wasted attempts. -# 4. Tries **up to** `max_attempts_per_objective` techniques before moving on. 
-# -# Unseen techniques start with an optimistic prior (100% success estimate), so the first -# few objectives effectively round-robin through every available technique before the -# selector converges on the best performers. -# -# ## Key Differences from Static Scenarios -# -# | Feature | Static Scenarios | TextAdaptive | -# |---------|-----------------|--------------| -# | Technique selection | Run all selected techniques | Selector picks per-objective | -# | Early stopping | No | Yes — stops on first success | -# | Learning | None | Updates success rates after each attempt | -# | Baseline | Prepended automatically | Forbidden — `prompt_sending` is a technique | -# | Efficiency | O(techniques × objectives) | O(max_attempts × objectives) | - -# %% [markdown] -# ## Setup - -# %% -from pathlib import Path - -from pyrit.registry import TargetRegistry -from pyrit.scenario import DatasetConfiguration -from pyrit.scenario.printer.console_printer import ConsoleScenarioResultPrinter -from pyrit.scenario.scenarios.adaptive import TextAdaptive, harm_category_context -from pyrit.setup import initialize_from_config_async - -await initialize_from_config_async(config_path=Path("../../scanner/pyrit_conf.yaml")) # type: ignore - -objective_target = TargetRegistry.get_registry_singleton().get_instance_by_name("openai_chat") -printer = ConsoleScenarioResultPrinter() - -# %% [markdown] -# ## Basic Usage -# -# The simplest way to run `TextAdaptive` uses all defaults: the selector explores with 20% -# probability, tries up to 3 techniques per objective, and uses the default dataset -# (AIRT harm categories). - -# %% -scenario = TextAdaptive() - -await scenario.initialize_async( # type: ignore - objective_target=objective_target, -) -result = await scenario.run_async() # type: ignore -await printer.print_summary_async(result) # type: ignore - -# %% [markdown] -# ## Customizing the Selector -# -# ### Epsilon (Exploration Rate) -# -# `epsilon` controls how often the selector explores vs. 
exploits: -# - `epsilon=0.0` — pure exploitation (always pick the best-known technique) -# - `epsilon=1.0` — pure exploration (random selection every time) -# - `epsilon=0.2` (default) — 20% random exploration, 80% exploitation - -# %% -# More explorative selector — useful when you want broader technique coverage -explorative_scenario = TextAdaptive(epsilon=0.5) - -await explorative_scenario.initialize_async( # type: ignore - objective_target=objective_target, - dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=4), -) -explorative_result = await explorative_scenario.run_async() # type: ignore -await printer.print_summary_async(explorative_result) # type: ignore - -# %% [markdown] -# ### Max Attempts Per Objective -# -# `max_attempts_per_objective` caps how many techniques the selector tries before giving -# up on an objective. Setting this higher gives more chances to succeed but costs more -# API calls. - -# %% -persistent_scenario = TextAdaptive(max_attempts_per_objective=5) - -await persistent_scenario.initialize_async( # type: ignore - objective_target=objective_target, - dataset_config=DatasetConfiguration(dataset_names=["airt_violence"], max_dataset_size=4), -) -persistent_result = await persistent_scenario.run_async() # type: ignore -await printer.print_summary_async(persistent_result) # type: ignore - -# %% [markdown] -# ## Context-Aware Selection -# -# By default, the selector shares one global table across all objectives. This means -# a technique that works well on hate-speech objectives also gets boosted for -# violence objectives. 
-# -# To partition the selector by harm category (so each category learns independently), -# pass `harm_category_context` as the `context_extractor`: - -# %% -contextual_scenario = TextAdaptive( - context_extractor=harm_category_context, - pool_threshold=2, -) - -await contextual_scenario.initialize_async( # type: ignore - objective_target=objective_target, - dataset_config=DatasetConfiguration( - dataset_names=["airt_hate", "airt_violence"], - max_dataset_size=4, - ), -) -contextual_result = await contextual_scenario.run_async() # type: ignore -await printer.print_summary_async(contextual_result) # type: ignore - -# %% [markdown] -# The `pool_threshold` parameter controls how many local observations are needed before -# the per-category estimate overrides the pooled-global estimate. With -# `pool_threshold=2`, the selector uses the global average until it has seen at least 2 -# results for a specific (category, technique) pair. - -# %% [markdown] -# ## Strategy Selection -# -# `TextAdaptive` builds its strategy enum dynamically from the scenario-techniques -# catalog. 
You can restrict which techniques participate using the -# `scenario_strategies` parameter: - -# %% -strategy_class = TextAdaptive.get_strategy_class() - -# See all available strategies -print("Available strategies:") -for member in strategy_class: - print(f" {member.value}") - -# %% [markdown] -# To limit the selector to only single-turn techniques: - -# %% -single_turn_scenario = TextAdaptive() - -await single_turn_scenario.initialize_async( # type: ignore - objective_target=objective_target, - scenario_strategies=[strategy_class("single_turn")], - dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=4), -) -single_turn_result = await single_turn_scenario.run_async() # type: ignore -await printer.print_summary_async(single_turn_result) # type: ignore - -# %% [markdown] -# ## Deterministic Runs -# -# For reproducibility, pass a `seed` to make the selector's random decisions deterministic: - -# %% -deterministic_scenario = TextAdaptive(seed=42, epsilon=0.3) - -await deterministic_scenario.initialize_async( # type: ignore - objective_target=objective_target, - dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=2), -) -deterministic_result = await deterministic_scenario.run_async() # type: ignore -await printer.print_summary_async(deterministic_result) # type: ignore - -# %% [markdown] -# ## Custom Scorer -# -# By default, `TextAdaptive` uses the standard composite scorer. 
You can override it -# with any `TrueFalseScorer`: - -# %% -from pyrit.prompt_target import OpenAIChatTarget -from pyrit.score import SelfAskRefusalScorer, TrueFalseInverterScorer - -refusal_scorer = SelfAskRefusalScorer(chat_target=OpenAIChatTarget()) -inverted_scorer = TrueFalseInverterScorer(scorer=refusal_scorer) - -custom_scorer_scenario = TextAdaptive(objective_scorer=inverted_scorer) - -await custom_scorer_scenario.initialize_async( # type: ignore - objective_target=objective_target, - dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=2), -) -custom_result = await custom_scorer_scenario.run_async() # type: ignore -await printer.print_summary_async(custom_result) # type: ignore - -# %% [markdown] -# ## Notes -# -# - **No baseline**: `TextAdaptive` has `BASELINE_POLICY = Forbidden`. The `prompt_sending` -# technique participates as one of the selector's techniques, so a separate baseline is redundant. -# - **Resumability**: Each atomic attack is keyed by `adaptive_{dataset}_{objective_id}`, so -# re-running a scenario picks up where it left off. -# - **Shared selector**: All objectives in a run share the same `AdaptiveTechniqueSelector` -# instance, so learning from one objective immediately benefits the next. 
diff --git a/pyrit/scenario/scenarios/adaptive/__init__.py b/pyrit/scenario/scenarios/adaptive/__init__.py index 2fb58b888..d0bd978c2 100644 --- a/pyrit/scenario/scenarios/adaptive/__init__.py +++ b/pyrit/scenario/scenarios/adaptive/__init__.py @@ -3,6 +3,7 @@ """Adaptive scenario classes.""" +from pyrit.scenario.scenarios.adaptive.adaptive_scenario import AdaptiveScenario from pyrit.scenario.scenarios.adaptive.dispatcher import ( ADAPTIVE_CONTEXT_LABEL, AdaptiveDispatchAttack, @@ -18,6 +19,7 @@ __all__ = [ "ADAPTIVE_CONTEXT_LABEL", "AdaptiveDispatchAttack", + "AdaptiveScenario", "AdaptiveTechniqueSelector", "ContextExtractor", "TextAdaptive", diff --git a/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py b/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py new file mode 100644 index 000000000..b14cc28c4 --- /dev/null +++ b/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py @@ -0,0 +1,278 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""``AdaptiveScenario`` — modality-agnostic base for scenarios that pick attack +techniques per-objective using an ``AdaptiveTechniqueSelector``. + +Owns selector wiring, dispatcher construction, per-objective atomic-attack +emission, and resume rehydration. Concrete subclasses (``TextAdaptive``, +future ``ImageAdaptive`` / ``AudioAdaptive``) only declare strategy class, +default datasets, version, and atomic-attack prefix. + +Baseline policy is ``Forbidden``: ``prompt_sending`` participates as one of +the selector's techniques rather than being prepended. 
+""" + +from __future__ import annotations + +import logging +import random +import uuid +from typing import TYPE_CHECKING, Any, ClassVar + +from pyrit.executor.attack import AttackScoringConfig +from pyrit.scenario.core.atomic_attack import AtomicAttack +from pyrit.scenario.core.attack_technique import AttackTechnique +from pyrit.scenario.core.scenario import BaselinePolicy, Scenario +from pyrit.scenario.scenarios.adaptive.dispatcher import ( + ADAPTIVE_CONTEXT_LABEL, + AdaptiveDispatchAttack, +) +from pyrit.scenario.scenarios.adaptive.selector import ( + AdaptiveTechniqueSelector, + ContextExtractor, + global_context, +) + +if TYPE_CHECKING: + from pyrit.executor.attack.core.attack_strategy import AttackStrategy + from pyrit.models import AttackResult, SeedAttackGroup + from pyrit.prompt_target import PromptTarget + from pyrit.score import TrueFalseScorer + +logger = logging.getLogger(__name__) + + +class AdaptiveScenario(Scenario): + """Abstract base for adaptive (epsilon-greedy) scenarios. + + Subclasses must implement the standard ``Scenario`` class-method overrides + and declare ``VERSION`` and ``_atomic_attack_prefix``. Selector wiring, + dispatcher construction, per-objective atomic-attack emission, and resume + rehydration are handled here. + """ + + BASELINE_POLICY: ClassVar[BaselinePolicy] = BaselinePolicy.Forbidden + + #: Subclasses must declare a scenario version for memory bookkeeping. + VERSION: ClassVar[int] + + #: Prefix for per-objective atomic-attack names (e.g. ``"adaptive_text"``). + _atomic_attack_prefix: ClassVar[str] = "adaptive" + + def __init__( + self, + *, + objective_scorer: TrueFalseScorer | None = None, + epsilon: float = 0.2, + pool_threshold: int = 3, + max_attempts_per_objective: int = 3, + seed: int | None = None, + context_extractor: ContextExtractor = global_context, + scenario_result_id: str | None = None, + ) -> None: + """ + Args: + objective_scorer (TrueFalseScorer | None): Scorer used to judge each + response. 
Defaults to the composite scorer from the base class. + epsilon (float): Exploration probability for the selector. Defaults to 0.2. + pool_threshold (int): Minimum per-(context, technique) attempts before + the local estimate overrides the pooled rate. Set to 1 to disable + pooling. Defaults to 3. + max_attempts_per_objective (int): Max techniques per objective. Defaults to 3. + seed (int | None): RNG seed for deterministic selection. Defaults to ``None``. + context_extractor (ContextExtractor): Maps a ``SeedAttackGroup`` to a + context key. Defaults to ``global_context``. + scenario_result_id (str | None): ID of an existing ``ScenarioResult`` to resume. + """ + if not objective_scorer: + objective_scorer = self._get_default_objective_scorer() + self._objective_scorer: TrueFalseScorer = objective_scorer + + self._epsilon = epsilon + self._pool_threshold = pool_threshold + self._max_attempts_per_objective = max_attempts_per_objective + self._seed = seed + self._context_extractor = context_extractor + + super().__init__( + version=self.VERSION, + strategy_class=self.get_strategy_class(), + objective_scorer=objective_scorer, + scenario_result_id=scenario_result_id, + ) + + async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: + """Build one ``AtomicAttack`` per objective, all sharing a single + ``AdaptiveDispatchAttack`` (and therefore a single selector). + """ + if self._objective_target is None: + raise ValueError("objective_target must be set before creating attacks") + + techniques = self._build_techniques_dict(objective_target=self._objective_target) + + selector = AdaptiveTechniqueSelector( + epsilon=self._epsilon, + pool_threshold=self._pool_threshold, + rng=random.Random(self._seed), + ) + # On resume, replay prior attempt outcomes from persisted metadata. 
+ self._rehydrate_selector_from_memory(selector=selector, known_techniques=set(techniques)) + + dispatcher = AdaptiveDispatchAttack( + objective_target=self._objective_target, + techniques=techniques, + selector=selector, + max_attempts_per_objective=self._max_attempts_per_objective, + ) + + seed_groups_by_dataset = self._dataset_config.get_seed_attack_groups() + atomic_attacks: list[AtomicAttack] = [] + for dataset_name, seed_groups in seed_groups_by_dataset.items(): + for seed_group in seed_groups: + atomic_attacks.append( + self._build_atomic_for_seed_group( + dataset_name=dataset_name, + seed_group=seed_group, + dispatcher=dispatcher, + ) + ) + + return atomic_attacks + + def _build_techniques_dict( + self, + *, + objective_target: PromptTarget, + ) -> dict[str, AttackStrategy[Any, AttackResult]]: + """Resolve selected strategies into a ``{name: inner_attack}`` map. + + Skips factories not registered for the current modality, and factories + whose technique requires a ``seed_technique`` (e.g. ``crescendo_simulated``) + — the dispatcher has no hook to merge technique seeds into per-objective + seed groups. + + Raises: + ValueError: If no techniques remain after filtering. Includes the + requested techniques and skip reasons. 
+ """ + selected_techniques = sorted({s.value for s in self._scenario_strategies}) + factories = self._get_attack_technique_factories() + scoring_config = AttackScoringConfig(objective_scorer=self._objective_scorer) + + techniques: dict[str, AttackStrategy[Any, AttackResult]] = {} + skipped_seed_technique: list[str] = [] + skipped_no_factory: list[str] = [] + for technique_name in selected_techniques: + factory = factories.get(technique_name) + if factory is None: + skipped_no_factory.append(technique_name) + logger.warning(f"No factory for technique '{technique_name}', skipping.") + continue + technique = factory.create( + objective_target=objective_target, + attack_scoring_config=scoring_config, + ) + if technique.seed_technique is not None: + skipped_seed_technique.append(technique_name) + logger.warning( + "Skipping technique '%s': it requires a seed_technique which the adaptive " + "dispatcher cannot merge into per-objective seed groups. Use a static " + "scenario (e.g. RapidResponse) to run this technique.", + technique_name, + ) + continue + techniques[technique_name] = technique.attack + + if not techniques: + details: list[str] = [] + if skipped_seed_technique: + details.append(f"skipped (require seed_technique): {sorted(skipped_seed_technique)}") + if skipped_no_factory: + details.append(f"skipped (no factory registered): {sorted(skipped_no_factory)}") + suffix = f" ({'; '.join(details)})" if details else "" + raise ValueError( + f"{type(self).__name__}: no usable techniques after resolving strategies. " + f"Check the --strategies selection.{suffix}" + ) + + return techniques + + def _build_atomic_for_seed_group( + self, + *, + dataset_name: str, + seed_group: SeedAttackGroup, + dispatcher: AdaptiveDispatchAttack, + ) -> AtomicAttack: + adaptive_context = self._context_extractor(seed_group) + # Prefer the objective's id when available so resume keys stay stable + # across re-fetches of the same seed groups. 
+ objective_id = seed_group.objective.id if seed_group.objective.id else uuid.uuid4() + atomic_attack_name = f"{self._atomic_attack_prefix}_{dataset_name}_{objective_id}" + + memory_labels = { + **self._memory_labels, + ADAPTIVE_CONTEXT_LABEL: adaptive_context, + } + return AtomicAttack( + atomic_attack_name=atomic_attack_name, + attack_technique=AttackTechnique(attack=dispatcher), + seed_groups=[seed_group], + objective_scorer=self._objective_scorer, + memory_labels=memory_labels, + display_group=dataset_name, + ) + + def _rehydrate_selector_from_memory( + self, + *, + selector: AdaptiveTechniqueSelector, + known_techniques: set[str], + ) -> None: + """Replay persisted dispatch trails into ``selector`` so resume + preserves learned state. + + Iterates every persisted ``AttackResult`` on the resumed + ``ScenarioResult`` and calls ``record_outcome`` once per attempt in + each ``metadata["adaptive_attempts"]`` trail. + + Args: + selector (AdaptiveTechniqueSelector): A freshly built selector to populate. + known_techniques (set[str]): Techniques available in the current run. + Trails referencing unknown techniques (e.g. after a strategies + change) are skipped so replay can't poison the table. 
+ """ + if not self._scenario_result_id: + return + + try: + scenario_results = self._memory.get_scenario_results(scenario_result_ids=[self._scenario_result_id]) + except Exception as exc: + logger.warning(f"AdaptiveScenario: failed to load prior scenario result for rehydration: {exc}") + return + + if not scenario_results: + return + + replayed = 0 + for results_list in scenario_results[0].attack_results.values(): + for result in results_list: + trail = result.metadata.get("adaptive_attempts") if result.metadata else None + context = result.metadata.get("adaptive_context") if result.metadata else None + if not trail or not context: + continue + for step in trail: + technique = step.get("technique") + outcome = step.get("outcome") + if not technique or technique not in known_techniques: + continue + selector.record_outcome( + context=context, + technique=technique, + success=outcome == "success", + ) + replayed += 1 + + if replayed: + logger.info(f"AdaptiveScenario: rehydrated selector with {replayed} prior attempt(s).") diff --git a/pyrit/scenario/scenarios/adaptive/dispatcher.py b/pyrit/scenario/scenarios/adaptive/dispatcher.py index 9f6e99c27..6f682fddd 100644 --- a/pyrit/scenario/scenarios/adaptive/dispatcher.py +++ b/pyrit/scenario/scenarios/adaptive/dispatcher.py @@ -1,25 +1,18 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -""" -``AdaptiveDispatchAttack`` — an ``AttackStrategy`` that picks which inner -technique to run for each objective using an ``AdaptiveTechniqueSelector``. - -This is the execution-side counterpart to the selector. The selector decides -*which technique to try*; the dispatcher *runs the technique*, records the -outcome, and loops up to ``max_attempts_per_objective`` times. - -The dispatcher reads an adaptive-context key from -``context.memory_labels[ADAPTIVE_CONTEXT_LABEL]``. 
The scenario is expected to -stamp that label per-objective (computed once at atomic-attack construction -time via a ``ContextExtractor``). When the label is missing, the global -context is used. +"""``AdaptiveDispatchAttack`` — picks an inner technique per attempt via an +``AdaptiveTechniqueSelector``, runs it, records the outcome, and loops up to +``max_attempts_per_objective`` times. Reads the per-objective context key from +``context.memory_labels[ADAPTIVE_CONTEXT_LABEL]`` (falls back to the global context). """ from __future__ import annotations import logging -from dataclasses import dataclass +import uuid +from dataclasses import dataclass, replace +from datetime import datetime, timezone from typing import TYPE_CHECKING, Any from pyrit.executor.attack.core.attack_parameters import AttackParameters @@ -36,35 +29,30 @@ logger = logging.getLogger(__name__) -"""Memory-label key whose value is the adaptive context string for an objective.""" +# Memory-label keys stamped onto persisted prompt rows so adaptive attempts +# can be filtered/grouped after a run. The scenario stamps the context once +# per objective; the dispatcher stamps technique + attempt index on each try. ADAPTIVE_CONTEXT_LABEL: str = "_adaptive_context" - +"""Per-objective context key (e.g. ``"_global"`` or a harm category).""" ADAPTIVE_TECHNIQUE_LABEL: str = "_adaptive_technique" +"""Technique chosen by the dispatcher for a given attempt.""" ADAPTIVE_ATTEMPT_LABEL: str = "_adaptive_attempt" +"""1-based attempt index within the per-objective loop.""" @dataclass class AdaptiveDispatchContext(AttackContext[AttackParameters]): - """ - Execution context for ``AdaptiveDispatchAttack``. - - No extra state is needed beyond what ``AttackContext`` provides; the - dispatcher reads the objective and memory labels from the base class. 
- """ + """Execution context for ``AdaptiveDispatchAttack`` (no extra state).""" class AdaptiveDispatchAttack(AttackStrategy[AdaptiveDispatchContext, AttackResult]): - """ - Attack that delegates each attempt to one of several inner ``AttackStrategy`` - instances ("techniques"), choosing per attempt via an ``AdaptiveTechniqueSelector``. + """Attack that delegates each attempt to one of several inner techniques, + choosing per attempt via an ``AdaptiveTechniqueSelector``. - For each objective the dispatcher loops up to ``max_attempts_per_objective`` - times. On each iteration it asks the selector which technique to try, executes - the inner attack with the objective, records the outcome on the selector, - and stops early on success. - - The selector instance is **shared by reference** with the scenario, so - learning accumulates across all objectives in a run. + For each objective, loops up to ``max_attempts_per_objective`` times: + ask the selector, execute the chosen technique, record the outcome, and + stop early on success. The selector is shared by reference with the + scenario so learning accumulates across objectives. """ def __init__( @@ -77,17 +65,13 @@ def __init__( ) -> None: """ Args: - objective_target (PromptTarget): The target the inner attacks run against. - Stored for identifier/logging parity; the dispatcher does not call - the target directly. + objective_target (PromptTarget): The target inner attacks run against. + Stored for identifier/logging parity; not called directly. techniques (dict[str, AttackStrategy[Any, AttackResult]]): Mapping from technique name to a pre-built inner attack. Must be non-empty. - These are constructed by the scenario from registered attack - technique factories. - selector (AdaptiveTechniqueSelector): Shared adaptive selection state - that tracks per-technique success rates across objectives. - max_attempts_per_objective (int): Maximum number of technique attempts - per objective. Must be >= 1. Defaults to 3. 
+ selector (AdaptiveTechniqueSelector): Shared selector state. + max_attempts_per_objective (int): Max attempts per objective; >= 1. + Defaults to 3. Raises: ValueError: If ``techniques`` is empty or ``max_attempts_per_objective`` < 1. @@ -154,11 +138,22 @@ async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackRes if success: break - # ``max_attempts`` is validated >= 1 above, so the loop always runs at least once. - assert last_result is not None - last_result.metadata = { - **last_result.metadata, - "adaptive_attempts": trail, - "adaptive_context": adaptive_context, - } - return last_result + # ``max_attempts`` is validated >= 1, so the loop always runs at least + # once. Guard explicitly rather than with ``assert`` (stripped under -O). + if last_result is None: # pragma: no cover - defensive + raise RuntimeError("AdaptiveDispatchAttack ran zero attempts; this should be unreachable.") + # Return a fresh dispatcher-owned ``AttackResult``: the inner attack + # already persisted ``last_result`` via its own post-execute hook, so + # returning it directly would cause a PK conflict on the outer hook. + # ``dataclasses.replace`` copies every field; we override identity + # fields and stamp the trail onto metadata. + return replace( + last_result, + attack_result_id=str(uuid.uuid4()), + timestamp=datetime.now(timezone.utc), + metadata={ + **last_result.metadata, + "adaptive_attempts": trail, + "adaptive_context": adaptive_context, + }, + ) diff --git a/pyrit/scenario/scenarios/adaptive/selector.py b/pyrit/scenario/scenarios/adaptive/selector.py index c5c9bc643..0add29990 100644 --- a/pyrit/scenario/scenarios/adaptive/selector.py +++ b/pyrit/scenario/scenarios/adaptive/selector.py @@ -1,25 +1,12 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -""" -Adaptive technique selection for the ``TextAdaptive`` scenario. 
- -This module provides: - - ``AdaptiveTechniqueSelector``: an epsilon-greedy selector keyed by - ``(context, technique)`` that tracks successes/attempts per technique and - picks the next technique to try. - - ``ContextExtractor``: a callable alias for deriving a context string - from a ``SeedAttackGroup``, plus two ready-made extractors: - ``global_context`` (single bucket) and ``harm_category_context`` - (first harm category, falling back to ``"_uncategorized"``). - -The selector is intentionally I/O-free and synchronous; it holds a small -mutable table that lives for the duration of a single scenario run. -""" +"""Epsilon-greedy selector and context extractors for adaptive scenarios.""" from __future__ import annotations import random +import threading from collections.abc import Callable, Sequence from typing import TYPE_CHECKING @@ -28,64 +15,48 @@ ContextExtractor = Callable[["SeedAttackGroup"], str] -"""Maps a ``SeedAttackGroup`` to an adaptive context key (e.g. a harm category).""" +"""Maps a ``SeedAttackGroup`` to an adaptive context key.""" - -# Sentinel context keys used when no per-objective partitioning is desired -# or when a seed group lacks harm category metadata. GLOBAL_CONTEXT: str = "_global" -"""Default context key: all objectives share one selection table.""" +"""Default context: all objectives share one selection table.""" UNCATEGORIZED_CONTEXT: str = "_uncategorized" """Fallback context for seed groups with no harm category metadata.""" -# Context extractors are module-level functions so they can be passed directly -# as the ``context_extractor`` argument to ``TextAdaptive``. They implement the -# ``ContextExtractor`` callable protocol. 
- - def global_context(_seed_attack_group: SeedAttackGroup) -> str: - """Return a constant context so all objectives share one selection table.""" + """Return a single shared context for all objectives.""" return GLOBAL_CONTEXT def harm_category_context(seed_attack_group: SeedAttackGroup) -> str: - """Return the first harm category on the seed group, or a fallback.""" + """Return a context keyed by the sorted, ``|``-joined harm categories. + + Multi-category seeds form their own bucket; sorting makes the key deterministic. + Returns ``UNCATEGORIZED_CONTEXT`` when no categories are set. + """ categories = seed_attack_group.harm_categories if not categories: return UNCATEGORIZED_CONTEXT - return sorted(categories)[0] + return "|".join(sorted(categories)) class AdaptiveTechniqueSelector: - """ - Epsilon-greedy selector over attack techniques. - - The selector maintains a table of ``(context, technique) -> (successes, attempts)`` - counts. ``select`` returns the next technique to try for a given context, - and ``record_outcome`` records the outcome of an attempt. - - Selection uses epsilon-greedy with optimistic initialization: - - With probability ``epsilon``, pick uniformly at random from ``techniques``. - - Otherwise, pick the technique with the highest estimated success rate. - The estimate is ``(successes + 1) / (attempts + 1)`` (Laplace smoothing), - so unseen techniques start at 100% and are explored first via tiebreak. - - When a ``(context, technique)`` cell has fewer than ``pool_threshold`` attempts, - the estimate falls back to the pooled global rate for that technique across all - contexts. This lets per-context selectors benefit from cross-context data - until they have enough local samples. Set ``pool_threshold=1`` to disable - pooling (use the local estimate as soon as any attempt is recorded). - - Note: - This class is not thread/async safe. 
It assumes sequential calls, - which matches the base ``Scenario._execute_scenario_async`` loop - (same pattern as all other scenarios). + """Epsilon-greedy selector over attack techniques. + + Maintains a ``(context, technique) -> (successes, attempts)`` table. With + probability ``epsilon`` picks uniformly at random; otherwise picks the + technique with the highest Laplace-smoothed estimate ``(s + 1) / (n + 1)`` + (unseen techniques start at 1.0). A ``(context, technique)`` cell with + fewer than ``pool_threshold`` attempts falls back to the technique's + pooled rate across all contexts. + + All public methods are guarded by a ``threading.Lock`` so concurrent + callers cannot corrupt the table. The lock makes individual ops atomic, + not the overall select → execute → record sequence. """ - # Tolerance for floating-point comparison when tiebreaking in exploitation. - # Current estimates are exact rationals, but this guards against future - # estimator changes that may introduce floating-point drift. + # Tolerance for tiebreaking on float estimates (current estimates are exact + # rationals; this guards against future estimator changes). _TIE_TOL: float = 1e-12 def __init__( @@ -99,19 +70,13 @@ def __init__( Args: epsilon (float): Exploration probability in [0.0, 1.0]. Defaults to 0.2. pool_threshold (int): Minimum per-(context, technique) attempts before - the local estimate replaces the pooled-global estimate. Until this - threshold is reached, the selector uses the technique's average - across all contexts. Must be >= 1; set to 1 to disable pooling. - Defaults to 3. - rng (random.Random | None): A ``random.Random`` instance for - reproducible selection decisions. Using a dedicated RNG (rather - than a bare float) enables seeded determinism across the full - sequence of select calls within a run. Defaults to a fresh - unseeded ``random.Random()``. + the local estimate replaces the pooled rate. Must be >= 1; set to 1 + to disable pooling. Defaults to 3. 
+ rng (random.Random | None): RNG for reproducible decisions. Defaults + to a fresh unseeded ``random.Random()``. Raises: - ValueError: If ``epsilon`` is outside [0.0, 1.0] or - ``pool_threshold`` is < 1. + ValueError: If ``epsilon`` is outside [0.0, 1.0] or ``pool_threshold`` < 1. """ if not 0.0 <= epsilon <= 1.0: raise ValueError(f"epsilon must be in [0.0, 1.0], got {epsilon}") @@ -122,17 +87,18 @@ def __init__( self._pool_threshold = pool_threshold self._rng = rng if rng is not None else random.Random() self._counts: dict[tuple[str, str], tuple[int, int]] = {} - # Per-arm pooled counts, kept in sync with ``_counts`` in ``update`` so - # ``_estimate``'s pooled-backoff branch is O(1). + # Per-technique pooled counts, kept in sync with ``_counts`` so the + # pooled-backoff branch in ``_estimate`` is O(1). self._global_counts: dict[str, tuple[int, int]] = {} + # Guards _counts, _global_counts, and _rng against concurrent callers. + self._lock = threading.Lock() def select(self, *, context: str, techniques: Sequence[str]) -> str: - """ - Pick the next technique to try for ``context``. + """Pick the next technique to try for ``context``. Args: - context (str): The context key (e.g. ``"_global"`` or a harm category). - techniques (Sequence[str]): The candidate technique names. + context (str): The context key. + techniques (Sequence[str]): Candidate technique names. Returns: str: The chosen technique name. 
@@ -144,59 +110,56 @@ def select(self, *, context: str, techniques: Sequence[str]) -> str: if not technique_list: raise ValueError("techniques must contain at least one entry") - if self._rng.random() < self._epsilon: - return self._rng.choice(technique_list) + with self._lock: + if self._rng.random() < self._epsilon: + return self._rng.choice(technique_list) - estimates = {t: self._estimate(context=context, technique=t) for t in technique_list} - best = max(estimates.values()) - winners = [t for t, value in estimates.items() if value >= best - self._TIE_TOL] - return self._rng.choice(winners) + estimates = {t: self._estimate(context=context, technique=t) for t in technique_list} + best = max(estimates.values()) + winners = [t for t, value in estimates.items() if value >= best - self._TIE_TOL] + return self._rng.choice(winners) def record_outcome(self, *, context: str, technique: str, success: bool) -> None: - """ - Record the outcome of an attack attempt for a given technique and context. + """Record the outcome of an attempt. Args: context (str): The context key the decision was made under. technique (str): The technique that was tried. success (bool): Whether the attempt succeeded. 
""" - successes, attempts = self._counts.get((context, technique), (0, 0)) - attempts += 1 - if success: - successes += 1 - self._counts[(context, technique)] = (successes, attempts) - - global_successes, global_attempts = self._global_counts.get(technique, (0, 0)) - global_attempts += 1 - if success: - global_successes += 1 - self._global_counts[technique] = (global_successes, global_attempts) + with self._lock: + successes, attempts = self._counts.get((context, technique), (0, 0)) + attempts += 1 + if success: + successes += 1 + self._counts[(context, technique)] = (successes, attempts) + + global_successes, global_attempts = self._global_counts.get(technique, (0, 0)) + global_attempts += 1 + if success: + global_successes += 1 + self._global_counts[technique] = (global_successes, global_attempts) def success_rate(self, *, context: str, technique: str) -> float: - """ - Return the Laplace-smoothed success-rate estimate for a technique in a context. - - The "smoothed" rate is ``(successes + 1) / (attempts + 1)`` — Laplace smoothing - provides an optimistic prior for unseen techniques (estimate = 1.0) and avoids - division by zero. This is the same value used internally for exploitation decisions. 
- """ - return self._estimate(context=context, technique=technique) + """Return the Laplace-smoothed estimate ``(s + 1) / (n + 1)`` used for exploitation.""" + with self._lock: + return self._estimate(context=context, technique=technique) def counts(self, *, context: str, technique: str) -> tuple[int, int]: """Return raw ``(successes, attempts)`` for a ``(context, technique)`` cell.""" - return self._counts.get((context, technique), (0, 0)) + with self._lock: + return self._counts.get((context, technique), (0, 0)) def snapshot(self) -> dict[tuple[str, str], tuple[int, int]]: """Return a shallow copy of the full counts table (for logging/debug).""" - return dict(self._counts) + with self._lock: + return dict(self._counts) def _estimate(self, *, context: str, technique: str) -> float: - """ - Laplace-smoothed success-rate estimate for ``(context, technique)``. + """Estimate for ``(context, technique)``; falls back to pooled rate below + ``pool_threshold`` local attempts. - Below ``pool_threshold`` local attempts, the estimate uses the - pooled-global success rate for the technique across all contexts. + Callers must already hold ``self._lock``. """ local_s, local_n = self._counts.get((context, technique), (0, 0)) if local_n >= self._pool_threshold: diff --git a/pyrit/scenario/scenarios/adaptive/text_adaptive.py b/pyrit/scenario/scenarios/adaptive/text_adaptive.py index 88885fef9..554c1ee10 100644 --- a/pyrit/scenario/scenarios/adaptive/text_adaptive.py +++ b/pyrit/scenario/scenarios/adaptive/text_adaptive.py @@ -1,51 +1,30 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -""" -TextAdaptive scenario — picks attack techniques per-objective using an -epsilon-greedy selector informed by observed per-run success rates. - -Unlike static scenarios (which run every selected technique against every -objective), TextAdaptive runs **up to** ``max_attempts_per_objective`` -techniques per objective and stops early when one succeeds. 
Which technique -to try next is decided by an ``AdaptiveTechniqueSelector`` whose estimates are -updated after every attempt. +"""``TextAdaptive`` — text adaptive scenario. -The set of available techniques comes from the selected scenario strategies, so -``--strategies single_turn`` restricts the selector to single-turn techniques, -etc. The default selector uses a single global context; pass a different -``context_extractor`` (e.g., ``harm_category_context``) to partition estimates -per category. +Picks attack techniques per-objective using an epsilon-greedy selector +informed by observed success rates. Runs up to ``max_attempts_per_objective`` +techniques per objective and stops early on success. The available techniques +come from the selected scenario strategies (``--strategies single_turn`` +restricts to single-turn techniques, etc.). """ from __future__ import annotations import logging -import random -import uuid -from typing import TYPE_CHECKING, Any, ClassVar, cast +from typing import ClassVar from pyrit.common import apply_defaults -from pyrit.executor.attack import AttackScoringConfig from pyrit.registry.tag_query import TagQuery from pyrit.scenario.core.dataset_configuration import DatasetConfiguration -from pyrit.scenario.core.scenario import BaselinePolicy, Scenario from pyrit.scenario.core.scenario_strategy import ScenarioStrategy -from pyrit.scenario.scenarios.adaptive.dispatcher import ( - ADAPTIVE_CONTEXT_LABEL, - AdaptiveDispatchAttack, -) +from pyrit.scenario.scenarios.adaptive.adaptive_scenario import AdaptiveScenario from pyrit.scenario.scenarios.adaptive.selector import ( - AdaptiveTechniqueSelector, ContextExtractor, global_context, ) - -if TYPE_CHECKING: - from pyrit.executor.attack.core.attack_strategy import AttackStrategy - from pyrit.models import AttackResult, SeedAttackGroup - from pyrit.scenario.core.atomic_attack import AtomicAttack - from pyrit.score import TrueFalseScorer +from pyrit.score import TrueFalseScorer logger = 
logging.getLogger(__name__) @@ -68,32 +47,18 @@ def _build_text_adaptive_strategy() -> type[ScenarioStrategy]: ) -class TextAdaptive(Scenario): - """ - Adaptive text-attack scenario that selects techniques per-objective using - an epsilon-greedy selector over the set of selected strategies. - - The selector: - - Picks a technique uniformly at random with probability ``epsilon``. - - Otherwise exploits the highest observed success rate. Unseen techniques - have an optimistic prior so the first few objectives effectively - round-robin through every available technique. - - Pools across contexts when a context has fewer than - ``pool_threshold`` observations for a technique. +class TextAdaptive(AdaptiveScenario): + """Adaptive text-attack scenario. - A baseline ``PromptSendingAttack`` is **not** prepended — every objective - runs through the dispatcher, and ``prompt_sending`` participates as one of - the selector's techniques. + Selects techniques per-objective via an epsilon-greedy selector over the + set of selected strategies. ``prompt_sending`` participates as one of the + selector's techniques rather than being prepended as a baseline. 
""" VERSION: int = 1 - BASELINE_POLICY: ClassVar[BaselinePolicy] = BaselinePolicy.Forbidden + _atomic_attack_prefix: ClassVar[str] = "adaptive" _cached_strategy_class: ClassVar[type[ScenarioStrategy] | None] = None - # ------------------------------------------------------------------ # - # Required class-method overrides # - # ------------------------------------------------------------------ # - @classmethod def get_strategy_class(cls) -> type[ScenarioStrategy]: if cls._cached_strategy_class is None: @@ -121,10 +86,6 @@ def required_datasets(cls) -> list[str]: def default_dataset_config(cls) -> DatasetConfiguration: return DatasetConfiguration(dataset_names=cls.required_datasets(), max_dataset_size=4) - # ------------------------------------------------------------------ # - # Constructor # - # ------------------------------------------------------------------ # - @apply_defaults def __init__( self, @@ -140,137 +101,24 @@ def __init__( """ Args: objective_scorer (TrueFalseScorer | None): Scorer used to judge each - response. Defaults to the composite scorer built from the base class. + response. Defaults to the composite scorer from the base class. epsilon (float): Exploration probability for the selector. Defaults to 0.2. - pool_threshold (int): Minimum per-(context, technique) attempts before the - local estimate overrides the pooled-global estimate. Set to 1 to - disable pooling. Defaults to 3. - max_attempts_per_objective (int): Maximum techniques tried per - objective before giving up. Defaults to 3. - seed (int | None): RNG seed for deterministic selection decisions. - Defaults to ``None`` (non-deterministic). - context_extractor (ContextExtractor): Function mapping a - ``SeedAttackGroup`` to a context key. Defaults to - ``global_context`` (one shared selection table). Use - ``harm_category_context`` to partition estimates by harm category. - scenario_result_id (str | None): ID of an existing ``ScenarioResult`` - to resume. 
+ pool_threshold (int): Minimum per-(context, technique) attempts before + the local estimate overrides the pooled rate. Set to 1 to disable + pooling. Defaults to 3. + max_attempts_per_objective (int): Max techniques per objective. Defaults to 3. + seed (int | None): RNG seed for deterministic selection. Defaults to ``None``. + context_extractor (ContextExtractor): Maps a ``SeedAttackGroup`` to a + context key. Defaults to ``global_context``. Use + ``harm_category_context`` to partition by harm category. + scenario_result_id (str | None): ID of an existing ``ScenarioResult`` to resume. """ - if not objective_scorer: - objective_scorer = self._get_default_objective_scorer() - - self._epsilon = epsilon - self._pool_threshold = pool_threshold - self._max_attempts_per_objective = max_attempts_per_objective - self._seed = seed - self._context_extractor = context_extractor - super().__init__( - version=self.VERSION, - strategy_class=self.get_strategy_class(), objective_scorer=objective_scorer, + epsilon=epsilon, + pool_threshold=pool_threshold, + max_attempts_per_objective=max_attempts_per_objective, + seed=seed, + context_extractor=context_extractor, scenario_result_id=scenario_result_id, ) - - # ------------------------------------------------------------------ # - # Override atomic-attack construction # - # ------------------------------------------------------------------ # - - async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: - """ - Build one ``AtomicAttack`` per objective, all sharing a single - ``AdaptiveDispatchAttack`` (and therefore a single - ``AdaptiveTechniqueSelector``). - - Each per-objective ``AtomicAttack`` consults and updates the same - selector via the same dispatcher instance, so learning from one - objective immediately benefits the next. 
- """ - if self._objective_target is None: - raise ValueError("objective_target must be set before creating attacks") - - selected_techniques = sorted({s.value for s in self._scenario_strategies}) - factories = self._get_attack_technique_factories() - - # Build each technique's inner attack once and reuse across all objectives. - # Skip factories that require a seed_technique (e.g. crescendo_simulated) - # since the dispatcher cannot merge technique seeds into the objective's - # seed group at dispatch time. - scoring_config = AttackScoringConfig(objective_scorer=cast("TrueFalseScorer", self._objective_scorer)) - techniques: dict[str, AttackStrategy[Any, AttackResult]] = {} - for technique_name in selected_techniques: - factory = factories.get(technique_name) - if factory is None: - logger.warning(f"No factory for technique '{technique_name}', skipping.") - continue - technique = factory.create( - objective_target=self._objective_target, - attack_scoring_config=scoring_config, - ) - if technique.seed_technique is not None: - logger.debug( - "Skipping technique '%s': requires seed_technique which adaptive dispatch cannot handle.", - technique_name, - ) - continue - techniques[technique_name] = technique.attack - - if not techniques: - raise ValueError( - "TextAdaptive: no usable techniques after resolving strategies. Check the --strategies selection." 
- ) - - selector = AdaptiveTechniqueSelector( - epsilon=self._epsilon, - pool_threshold=self._pool_threshold, - rng=random.Random(self._seed), - ) - dispatcher = AdaptiveDispatchAttack( - objective_target=self._objective_target, - techniques=techniques, - selector=selector, - max_attempts_per_objective=self._max_attempts_per_objective, - ) - - seed_groups_by_dataset = self._dataset_config.get_seed_attack_groups() - atomic_attacks: list[AtomicAttack] = [] - for dataset_name, seed_groups in seed_groups_by_dataset.items(): - for seed_group in seed_groups: - atomic_attacks.append( - self._build_atomic_for_seed_group( - dataset_name=dataset_name, - seed_group=seed_group, - dispatcher=dispatcher, - ) - ) - - return atomic_attacks - - def _build_atomic_for_seed_group( - self, - *, - dataset_name: str, - seed_group: SeedAttackGroup, - dispatcher: AdaptiveDispatchAttack, - ) -> AtomicAttack: - from pyrit.scenario.core.atomic_attack import AtomicAttack - from pyrit.scenario.core.attack_technique import AttackTechnique - - adaptive_context = self._context_extractor(seed_group) - # Use the objective's id when available so resume keys are stable across - # runs that re-fetch the same seed groups; fall back to a random uuid. 
- objective_id = seed_group.objective.id if seed_group.objective.id else uuid.uuid4() - atomic_attack_name = f"adaptive_{dataset_name}_{objective_id}" - - memory_labels = { - **self._memory_labels, - ADAPTIVE_CONTEXT_LABEL: adaptive_context, - } - return AtomicAttack( - atomic_attack_name=atomic_attack_name, - attack_technique=AttackTechnique(attack=dispatcher), - seed_groups=[seed_group], - objective_scorer=cast("TrueFalseScorer", self._objective_scorer), - memory_labels=memory_labels, - display_group=dataset_name, - ) diff --git a/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py b/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py index 87170faa1..422e3d431 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py +++ b/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py @@ -200,6 +200,39 @@ async def test_metadata_records_adaptive_trail(self, target, selector): ] assert result.metadata["adaptive_context"] == GLOBAL_CONTEXT + async def test_returns_fresh_result_distinct_from_inner(self, target, selector): + # The dispatcher must NOT return the inner attack's ``AttackResult`` + # instance — doing so would cause a duplicate-PK insert when both the + # inner and the dispatcher's ``execute_async`` post-execute hooks try + # to persist the same row. Verify the returned result has a fresh + # ``attack_result_id`` while preserving the inner's identifying fields + # and stamping the dispatch trail. + a = _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS]) + dispatcher = AdaptiveDispatchAttack( + objective_target=target, + techniques={"a": a}, + selector=selector, + ) + # Capture the inner result's id by spying on execute_async. 
+ original_execute = a.execute_async + inner_ids: list[str] = [] + + async def _spy(**kwargs): + inner_result = await original_execute(**kwargs) + inner_ids.append(inner_result.attack_result_id) + return inner_result + + a.execute_async = _spy # type: ignore[assignment] + + result = await dispatcher._perform_async(context=_make_context()) + + assert len(inner_ids) == 1 + assert result.attack_result_id != inner_ids[0] + assert result.conversation_id # carried over from inner + assert result.outcome == AttackOutcome.SUCCESS + assert result.metadata["adaptive_attempts"] == [{"technique": "a", "outcome": "success"}] + assert result.metadata["adaptive_context"] == GLOBAL_CONTEXT + @pytest.mark.usefixtures("patch_central_database") class TestValidate: diff --git a/tests/unit/scenario/scenarios/adaptive/test_selector.py b/tests/unit/scenario/scenarios/adaptive/test_selector.py index ab6aae03e..370430497 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_selector.py +++ b/tests/unit/scenario/scenarios/adaptive/test_selector.py @@ -171,8 +171,13 @@ def test_global_context_is_constant(self): def test_harm_category_context_uses_sorted_first_category(self): sg = MagicMock() sg.harm_categories = ["violence", "hate"] - # sorted() ensures deterministic selection regardless of set iteration order - assert harm_category_context(sg) == "hate" + # Multi-category seeds form their own bucket; sorting keeps the key deterministic. 
+ assert harm_category_context(sg) == "hate|violence" + + def test_harm_category_context_single_category(self): + sg = MagicMock() + sg.harm_categories = ["violence"] + assert harm_category_context(sg) == "violence" def test_harm_category_context_falls_back_when_empty(self): sg = MagicMock() @@ -183,3 +188,38 @@ def test_harm_category_context_falls_back_when_none(self): sg = MagicMock() sg.harm_categories = None assert harm_category_context(sg) == UNCATEGORIZED_CONTEXT + + +class TestAdaptiveTechniqueSelectorConcurrency: + """Concurrent record_outcome / select calls must not corrupt counts.""" + + def test_concurrent_record_outcome_preserves_total_attempts(self): + import threading + + selector = _seeded_selector(pool_threshold=1) + threads_per_arm = 8 + attempts_per_thread = 100 + techniques = ["a", "b", "c", "d"] + + def worker(technique: str, success_pattern: list[bool]) -> None: + for ok in success_pattern: + selector.record_outcome(context=GLOBAL_CONTEXT, technique=technique, success=ok) + + threads: list[threading.Thread] = [] + expected_successes: dict[str, int] = dict.fromkeys(techniques, 0) + for t in techniques: + for i in range(threads_per_arm): + pattern = [(j + i) % 2 == 0 for j in range(attempts_per_thread)] + expected_successes[t] += sum(pattern) + threads.append(threading.Thread(target=worker, args=(t, pattern))) + + for th in threads: + th.start() + for th in threads: + th.join() + + # Every increment landed: no lost updates from interleaved read-modify-write. 
+ for t in techniques: + successes, attempts = selector.counts(context=GLOBAL_CONTEXT, technique=t) + assert attempts == threads_per_arm * attempts_per_thread + assert successes == expected_successes[t] diff --git a/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py b/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py index c67abc0fc..c32cab41d 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py +++ b/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py @@ -258,6 +258,125 @@ async def test_no_usable_techniques_raises(self, mock_objective_target, mock_obj await scenario._get_atomic_attacks_async() +@pytest.mark.usefixtures(*FIXTURES) +class TestTextAdaptiveSelectorRehydration: + """When resuming, prior dispatch trails should replay into the new selector.""" + + def _build_scenario_no_resume_id(self, *, scorer): + return TextAdaptive(objective_scorer=scorer) + + def test_no_scenario_result_id_is_noop(self, mock_objective_scorer): + from pyrit.scenario.scenarios.adaptive.selector import AdaptiveTechniqueSelector + + scenario = TextAdaptive(objective_scorer=mock_objective_scorer) + selector = AdaptiveTechniqueSelector() + # No scenario_result_id set -> early return, no errors, no replays. 
+ scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a", "b"}) + assert selector.snapshot() == {} + + def test_replays_attempts_from_metadata(self, mock_objective_scorer): + from pyrit.models import AttackResult + from pyrit.scenario.scenarios.adaptive.selector import AdaptiveTechniqueSelector + + scenario = TextAdaptive(objective_scorer=mock_objective_scorer, scenario_result_id="rid") + + prior_result = MagicMock() + prior_result.attack_results = { + "adaptive_violence_o1": [ + AttackResult( + conversation_id="c1", + objective="o1", + metadata={ + "adaptive_attempts": [ + {"technique": "a", "outcome": "failure"}, + {"technique": "b", "outcome": "success"}, + ], + "adaptive_context": "violence", + }, + ), + ], + "adaptive_hate_o2": [ + AttackResult( + conversation_id="c2", + objective="o2", + metadata={ + "adaptive_attempts": [{"technique": "a", "outcome": "success"}], + "adaptive_context": "hate", + }, + ), + ], + } + scenario._memory = MagicMock() + scenario._memory.get_scenario_results.return_value = [prior_result] + + selector = AdaptiveTechniqueSelector() + scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a", "b"}) + + # Trails replayed verbatim into the per-context table. 
+ assert selector.counts(context="violence", technique="a") == (0, 1) + assert selector.counts(context="violence", technique="b") == (1, 1) + assert selector.counts(context="hate", technique="a") == (1, 1) + + def test_skips_unknown_techniques(self, mock_objective_scorer): + from pyrit.models import AttackResult + from pyrit.scenario.scenarios.adaptive.selector import AdaptiveTechniqueSelector + + scenario = TextAdaptive(objective_scorer=mock_objective_scorer, scenario_result_id="rid") + prior_result = MagicMock() + prior_result.attack_results = { + "x": [ + AttackResult( + conversation_id="c1", + objective="o1", + metadata={ + "adaptive_attempts": [ + {"technique": "removed_technique", "outcome": "success"}, + {"technique": "a", "outcome": "failure"}, + ], + "adaptive_context": "ctx", + }, + ), + ], + } + scenario._memory = MagicMock() + scenario._memory.get_scenario_results.return_value = [prior_result] + + selector = AdaptiveTechniqueSelector() + scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a"}) + + # Only the known technique was recorded. 
+ assert selector.counts(context="ctx", technique="a") == (0, 1) + assert selector.counts(context="ctx", technique="removed_technique") == (0, 0) + + def test_ignores_results_without_adaptive_metadata(self, mock_objective_scorer): + from pyrit.models import AttackResult + from pyrit.scenario.scenarios.adaptive.selector import AdaptiveTechniqueSelector + + scenario = TextAdaptive(objective_scorer=mock_objective_scorer, scenario_result_id="rid") + prior_result = MagicMock() + prior_result.attack_results = { + "baseline": [AttackResult(conversation_id="c", objective="o", metadata={})], + } + scenario._memory = MagicMock() + scenario._memory.get_scenario_results.return_value = [prior_result] + + selector = AdaptiveTechniqueSelector() + scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a"}) + assert selector.snapshot() == {} + + def test_memory_load_failure_is_swallowed(self, mock_objective_scorer): + from pyrit.scenario.scenarios.adaptive.selector import AdaptiveTechniqueSelector + + scenario = TextAdaptive(objective_scorer=mock_objective_scorer, scenario_result_id="rid") + scenario._memory = MagicMock() + scenario._memory.get_scenario_results.side_effect = RuntimeError("db down") + + selector = AdaptiveTechniqueSelector() + # Must not raise; selector remains empty. 
+ scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a"}) + assert selector.snapshot() == {} + + @pytest.mark.usefixtures(*FIXTURES) class TestTextAdaptiveBaselinePolicy: async def test_initialize_async_rejects_explicit_baseline(self, mock_objective_target, mock_objective_scorer): From 2c06a24d006d561c3cedc33827e83fe3135b52bf Mon Sep 17 00:00:00 2001 From: hannahwestra25 Date: Tue, 19 May 2026 13:07:03 -0400 Subject: [PATCH 06/12] pre-commit --- pyrit/scenario/scenarios/adaptive/dispatcher.py | 6 ++++-- pyrit/scenario/scenarios/adaptive/selector.py | 15 ++++++++++----- .../scenario/scenarios/adaptive/text_adaptive.py | 6 ++++-- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/pyrit/scenario/scenarios/adaptive/dispatcher.py b/pyrit/scenario/scenarios/adaptive/dispatcher.py index 6f682fddd..d12fdd2f5 100644 --- a/pyrit/scenario/scenarios/adaptive/dispatcher.py +++ b/pyrit/scenario/scenarios/adaptive/dispatcher.py @@ -1,7 +1,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -"""``AdaptiveDispatchAttack`` — picks an inner technique per attempt via an +""" +``AdaptiveDispatchAttack`` — picks an inner technique per attempt via an ``AdaptiveTechniqueSelector``, runs it, records the outcome, and loops up to ``max_attempts_per_objective`` times. Reads the per-objective context key from ``context.memory_labels[ADAPTIVE_CONTEXT_LABEL]`` (falls back to the global context). @@ -46,7 +47,8 @@ class AdaptiveDispatchContext(AttackContext[AttackParameters]): class AdaptiveDispatchAttack(AttackStrategy[AdaptiveDispatchContext, AttackResult]): - """Attack that delegates each attempt to one of several inner techniques, + """ + Attack that delegates each attempt to one of several inner techniques, choosing per attempt via an ``AdaptiveTechniqueSelector``. 
For each objective, loops up to ``max_attempts_per_objective`` times: diff --git a/pyrit/scenario/scenarios/adaptive/selector.py b/pyrit/scenario/scenarios/adaptive/selector.py index 0add29990..967aee9d1 100644 --- a/pyrit/scenario/scenarios/adaptive/selector.py +++ b/pyrit/scenario/scenarios/adaptive/selector.py @@ -29,7 +29,8 @@ def global_context(_seed_attack_group: SeedAttackGroup) -> str: def harm_category_context(seed_attack_group: SeedAttackGroup) -> str: - """Return a context keyed by the sorted, ``|``-joined harm categories. + """ + Return a context keyed by the sorted, ``|``-joined harm categories. Multi-category seeds form their own bucket; sorting makes the key deterministic. Returns ``UNCATEGORIZED_CONTEXT`` when no categories are set. @@ -41,7 +42,8 @@ def harm_category_context(seed_attack_group: SeedAttackGroup) -> str: class AdaptiveTechniqueSelector: - """Epsilon-greedy selector over attack techniques. + """ + Epsilon-greedy selector over attack techniques. Maintains a ``(context, technique) -> (successes, attempts)`` table. With probability ``epsilon`` picks uniformly at random; otherwise picks the @@ -94,7 +96,8 @@ def __init__( self._lock = threading.Lock() def select(self, *, context: str, techniques: Sequence[str]) -> str: - """Pick the next technique to try for ``context``. + """ + Pick the next technique to try for ``context``. Args: context (str): The context key. @@ -120,7 +123,8 @@ def select(self, *, context: str, techniques: Sequence[str]) -> str: return self._rng.choice(winners) def record_outcome(self, *, context: str, technique: str, success: bool) -> None: - """Record the outcome of an attempt. + """ + Record the outcome of an attempt. Args: context (str): The context key the decision was made under. 
@@ -156,7 +160,8 @@ def snapshot(self) -> dict[tuple[str, str], tuple[int, int]]: return dict(self._counts) def _estimate(self, *, context: str, technique: str) -> float: - """Estimate for ``(context, technique)``; falls back to pooled rate below + """ + Estimate for ``(context, technique)``; falls back to pooled rate below ``pool_threshold`` local attempts. Callers must already hold ``self._lock``. diff --git a/pyrit/scenario/scenarios/adaptive/text_adaptive.py b/pyrit/scenario/scenarios/adaptive/text_adaptive.py index 554c1ee10..bc08edd9c 100644 --- a/pyrit/scenario/scenarios/adaptive/text_adaptive.py +++ b/pyrit/scenario/scenarios/adaptive/text_adaptive.py @@ -1,7 +1,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -"""``TextAdaptive`` — text adaptive scenario. +""" +``TextAdaptive`` — text adaptive scenario. Picks attack techniques per-objective using an epsilon-greedy selector informed by observed success rates. Runs up to ``max_attempts_per_objective`` @@ -48,7 +49,8 @@ def _build_text_adaptive_strategy() -> type[ScenarioStrategy]: class TextAdaptive(AdaptiveScenario): - """Adaptive text-attack scenario. + """ + Adaptive text-attack scenario. Selects techniques per-objective via an epsilon-greedy selector over the set of selected strategies. 
``prompt_sending`` participates as one of the From 11b39a0708aef6cdc746979efb763a022e143c71 Mon Sep 17 00:00:00 2001 From: hannahwestra25 Date: Tue, 19 May 2026 15:34:25 -0400 Subject: [PATCH 07/12] integrate attack technique group --- doc/code/scenarios/3_adaptive_scenarios.ipynb | 11 +- doc/code/scenarios/3_adaptive_scenarios.py | 11 +- .../scenarios/adaptive/adaptive_scenario.py | 126 +++++++---- .../scenario/scenarios/adaptive/dispatcher.py | 106 ++++++++- .../scenarios/adaptive/test_dispatcher.py | 207 +++++++++++------- .../scenarios/adaptive/test_selector.py | 2 +- .../scenarios/adaptive/test_text_adaptive.py | 158 +++++++++++-- 7 files changed, 462 insertions(+), 159 deletions(-) diff --git a/doc/code/scenarios/3_adaptive_scenarios.ipynb b/doc/code/scenarios/3_adaptive_scenarios.ipynb index 93938c1d6..2067a8896 100644 --- a/doc/code/scenarios/3_adaptive_scenarios.ipynb +++ b/doc/code/scenarios/3_adaptive_scenarios.ipynb @@ -245,8 +245,15 @@ "source": [ "## Resuming a run\n", "\n", - "Adaptive scenarios are resumable — pass `scenario_result_id=...` to `initialize_async`\n", - "and the run picks up where it left off, with prior outcomes replayed into the selector." 
+ "Adaptive scenarios are resumable — pass `scenario_result_id=...` to the `TextAdaptive`\n", + "constructor and the run picks up where it left off, with prior outcomes replayed into\n", + "the selector.\n", + "\n", + "```python\n", + "resumed_scenario = TextAdaptive(scenario_result_id=\"\")\n", + "await resumed_scenario.initialize_async(objective_target=objective_target)\n", + "resumed_result = await resumed_scenario.run_async()\n", + "```" ] } ], diff --git a/doc/code/scenarios/3_adaptive_scenarios.py b/doc/code/scenarios/3_adaptive_scenarios.py index 27a4cffbd..038561903 100644 --- a/doc/code/scenarios/3_adaptive_scenarios.py +++ b/doc/code/scenarios/3_adaptive_scenarios.py @@ -160,5 +160,12 @@ # %% [markdown] # ## Resuming a run # -# Adaptive scenarios are resumable — pass `scenario_result_id=...` to `initialize_async` -# and the run picks up where it left off, with prior outcomes replayed into the selector. +# Adaptive scenarios are resumable — pass `scenario_result_id=...` to the `TextAdaptive` +# constructor and the run picks up where it left off, with prior outcomes replayed into +# the selector. +# +# ```python +# resumed_scenario = TextAdaptive(scenario_result_id="") +# await resumed_scenario.initialize_async(objective_target=objective_target) +# resumed_result = await resumed_scenario.run_async() +# ``` diff --git a/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py b/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py index b14cc28c4..6edf753e2 100644 --- a/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py +++ b/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py @@ -1,7 +1,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -"""``AdaptiveScenario`` — modality-agnostic base for scenarios that pick attack +""" +``AdaptiveScenario`` — modality-agnostic base for scenarios that pick attack techniques per-objective using an ``AdaptiveTechniqueSelector``. 
Owns selector wiring, dispatcher construction, per-objective atomic-attack @@ -18,7 +19,7 @@ import logging import random import uuid -from typing import TYPE_CHECKING, Any, ClassVar +from typing import TYPE_CHECKING, ClassVar from pyrit.executor.attack import AttackScoringConfig from pyrit.scenario.core.atomic_attack import AtomicAttack @@ -27,6 +28,7 @@ from pyrit.scenario.scenarios.adaptive.dispatcher import ( ADAPTIVE_CONTEXT_LABEL, AdaptiveDispatchAttack, + TechniqueBundle, ) from pyrit.scenario.scenarios.adaptive.selector import ( AdaptiveTechniqueSelector, @@ -35,8 +37,7 @@ ) if TYPE_CHECKING: - from pyrit.executor.attack.core.attack_strategy import AttackStrategy - from pyrit.models import AttackResult, SeedAttackGroup + from pyrit.models import SeedAttackGroup from pyrit.prompt_target import PromptTarget from pyrit.score import TrueFalseScorer @@ -44,7 +45,8 @@ class AdaptiveScenario(Scenario): - """Abstract base for adaptive (epsilon-greedy) scenarios. + """ + Abstract base for adaptive (epsilon-greedy) scenarios. Subclasses must implement the standard ``Scenario`` class-method overrides and declare ``VERSION`` and ``_atomic_attack_prefix``. Selector wiring, @@ -103,8 +105,14 @@ def __init__( ) async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: - """Build one ``AtomicAttack`` per objective, all sharing a single - ``AdaptiveDispatchAttack`` (and therefore a single selector). + """ + Build one ``AtomicAttack`` per objective. + + Each objective gets a freshly constructed ``AdaptiveDispatchAttack`` + bound to its seed group, but all dispatchers share the same selector + so learning accumulates across objectives. Per-objective, techniques + whose ``seed_technique`` is incompatible with the seed group are + filtered out; objectives left with no compatible techniques are skipped. 
""" if self._objective_target is None: raise ValueError("objective_target must be set before creating attacks") @@ -119,24 +127,18 @@ async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: # On resume, replay prior attempt outcomes from persisted metadata. self._rehydrate_selector_from_memory(selector=selector, known_techniques=set(techniques)) - dispatcher = AdaptiveDispatchAttack( - objective_target=self._objective_target, - techniques=techniques, - selector=selector, - max_attempts_per_objective=self._max_attempts_per_objective, - ) - seed_groups_by_dataset = self._dataset_config.get_seed_attack_groups() atomic_attacks: list[AtomicAttack] = [] for dataset_name, seed_groups in seed_groups_by_dataset.items(): for seed_group in seed_groups: - atomic_attacks.append( - self._build_atomic_for_seed_group( - dataset_name=dataset_name, - seed_group=seed_group, - dispatcher=dispatcher, - ) + atomic = self._build_atomic_for_seed_group( + dataset_name=dataset_name, + seed_group=seed_group, + techniques=techniques, + selector=selector, ) + if atomic is not None: + atomic_attacks.append(atomic) return atomic_attacks @@ -144,13 +146,13 @@ def _build_techniques_dict( self, *, objective_target: PromptTarget, - ) -> dict[str, AttackStrategy[Any, AttackResult]]: - """Resolve selected strategies into a ``{name: inner_attack}`` map. + ) -> dict[str, TechniqueBundle]: + """ + Resolve selected strategies into a ``{name: TechniqueBundle}`` map. - Skips factories not registered for the current modality, and factories - whose technique requires a ``seed_technique`` (e.g. ``crescendo_simulated``) - — the dispatcher has no hook to merge technique seeds into per-objective - seed groups. + Each bundle carries the inner attack strategy along with the factory's + ``seed_technique`` and ``adversarial_chat`` so the dispatcher can + reproduce the static ``AtomicAttack`` execution path per attempt. Raises: ValueError: If no techniques remain after filtering. 
Includes the @@ -160,8 +162,7 @@ def _build_techniques_dict( factories = self._get_attack_technique_factories() scoring_config = AttackScoringConfig(objective_scorer=self._objective_scorer) - techniques: dict[str, AttackStrategy[Any, AttackResult]] = {} - skipped_seed_technique: list[str] = [] + techniques: dict[str, TechniqueBundle] = {} skipped_no_factory: list[str] = [] for technique_name in selected_techniques: factory = factories.get(technique_name) @@ -173,24 +174,14 @@ def _build_techniques_dict( objective_target=objective_target, attack_scoring_config=scoring_config, ) - if technique.seed_technique is not None: - skipped_seed_technique.append(technique_name) - logger.warning( - "Skipping technique '%s': it requires a seed_technique which the adaptive " - "dispatcher cannot merge into per-objective seed groups. Use a static " - "scenario (e.g. RapidResponse) to run this technique.", - technique_name, - ) - continue - techniques[technique_name] = technique.attack + techniques[technique_name] = TechniqueBundle( + attack=technique.attack, + seed_technique=technique.seed_technique, + adversarial_chat=factory.adversarial_chat, + ) if not techniques: - details: list[str] = [] - if skipped_seed_technique: - details.append(f"skipped (require seed_technique): {sorted(skipped_seed_technique)}") - if skipped_no_factory: - details.append(f"skipped (no factory registered): {sorted(skipped_no_factory)}") - suffix = f" ({'; '.join(details)})" if details else "" + suffix = f" (skipped, no factory registered: {sorted(skipped_no_factory)})" if skipped_no_factory else "" raise ValueError( f"{type(self).__name__}: no usable techniques after resolving strategies. 
" f"Check the --strategies selection.{suffix}" @@ -203,14 +194,50 @@ def _build_atomic_for_seed_group( *, dataset_name: str, seed_group: SeedAttackGroup, - dispatcher: AdaptiveDispatchAttack, - ) -> AtomicAttack: + techniques: dict[str, TechniqueBundle], + selector: AdaptiveTechniqueSelector, + ) -> AtomicAttack | None: + """ + Build a single ``AtomicAttack`` for one ``SeedAttackGroup``. + + Filters the technique pool down to those whose ``seed_technique`` (if + any) is compatible with this seed group, then constructs a dedicated + ``AdaptiveDispatchAttack`` bound to this seed group. Returns ``None`` + when no techniques are compatible (caller skips the objective). + """ + if self._objective_target is None: # pragma: no cover - defensive + raise ValueError("objective_target must be set before creating attacks") + + compatible: dict[str, TechniqueBundle] = {} + for name, bundle in techniques.items(): + if bundle.seed_technique is None or seed_group.is_compatible_with_technique( + technique=bundle.seed_technique + ): + compatible[name] = bundle + + if not compatible: + logger.warning( + "AdaptiveScenario: no compatible techniques for seed group in dataset '%s' (objective=%r); skipping.", + dataset_name, + seed_group.objective.value, + ) + return None + adaptive_context = self._context_extractor(seed_group) # Prefer the objective's id when available so resume keys stay stable # across re-fetches of the same seed groups. 
objective_id = seed_group.objective.id if seed_group.objective.id else uuid.uuid4() atomic_attack_name = f"{self._atomic_attack_prefix}_{dataset_name}_{objective_id}" + dispatcher = AdaptiveDispatchAttack( + objective_target=self._objective_target, + techniques=compatible, + selector=selector, + seed_group=seed_group, + objective_scorer=self._objective_scorer, + max_attempts_per_objective=self._max_attempts_per_objective, + ) + memory_labels = { **self._memory_labels, ADAPTIVE_CONTEXT_LABEL: adaptive_context, @@ -230,7 +257,8 @@ def _rehydrate_selector_from_memory( selector: AdaptiveTechniqueSelector, known_techniques: set[str], ) -> None: - """Replay persisted dispatch trails into ``selector`` so resume + """ + Replay persisted dispatch trails into ``selector`` so resume preserves learned state. Iterates every persisted ``AttackResult`` on the resumed @@ -246,9 +274,11 @@ def _rehydrate_selector_from_memory( if not self._scenario_result_id: return + # Narrow to errors a memory backend would plausibly raise (DB/IO + # failures, integrity issues). Programmer-level errors propagate. try: scenario_results = self._memory.get_scenario_results(scenario_result_ids=[self._scenario_result_id]) - except Exception as exc: + except (RuntimeError, OSError, ValueError) as exc: logger.warning(f"AdaptiveScenario: failed to load prior scenario result for rehydration: {exc}") return diff --git a/pyrit/scenario/scenarios/adaptive/dispatcher.py b/pyrit/scenario/scenarios/adaptive/dispatcher.py index d12fdd2f5..8499102ca 100644 --- a/pyrit/scenario/scenarios/adaptive/dispatcher.py +++ b/pyrit/scenario/scenarios/adaptive/dispatcher.py @@ -6,6 +6,10 @@ ``AdaptiveTechniqueSelector``, runs it, records the outcome, and loops up to ``max_attempts_per_objective`` times. Reads the per-objective context key from ``context.memory_labels[ADAPTIVE_CONTEXT_LABEL]`` (falls back to the global context). 
+ +The dispatcher is bound to a single ``SeedAttackGroup`` at construction time so +it can merge each chosen technique's ``seed_technique`` (when present) into the +seed group before delegating execution to ``AttackExecutor``. """ from __future__ import annotations @@ -16,6 +20,7 @@ from datetime import datetime, timezone from typing import TYPE_CHECKING, Any +from pyrit.executor.attack.core.attack_executor import AttackExecutor from pyrit.executor.attack.core.attack_parameters import AttackParameters from pyrit.executor.attack.core.attack_strategy import AttackContext, AttackStrategy from pyrit.models import AttackOutcome, AttackResult @@ -25,7 +30,9 @@ ) if TYPE_CHECKING: + from pyrit.models import SeedAttackGroup, SeedAttackTechniqueGroup from pyrit.prompt_target import PromptTarget + from pyrit.score import TrueFalseScorer logger = logging.getLogger(__name__) @@ -41,6 +48,21 @@ """1-based attempt index within the per-objective loop.""" +@dataclass(frozen=True) +class TechniqueBundle: + """ + Per-technique bundle consumed by the dispatcher. + + Carries the inner attack strategy alongside the factory-supplied + ``seed_technique`` (if any) and ``adversarial_chat`` (required when the + seed_technique contains a simulated-conversation config). + """ + + attack: AttackStrategy[Any, AttackResult] + seed_technique: SeedAttackTechniqueGroup | None = None + adversarial_chat: PromptTarget | None = None + + @dataclass class AdaptiveDispatchContext(AttackContext[AttackParameters]): """Execution context for ``AdaptiveDispatchAttack`` (no extra state).""" @@ -55,23 +77,43 @@ class AdaptiveDispatchAttack(AttackStrategy[AdaptiveDispatchContext, AttackResul ask the selector, execute the chosen technique, record the outcome, and stop early on success. The selector is shared by reference with the scenario so learning accumulates across objectives. + + The dispatcher is bound to a single ``SeedAttackGroup`` at construction + time. 
When a chosen technique declares a ``seed_technique``, that group + is merged into the seed group before execution (mirroring the static + ``AtomicAttack`` path). + + On success, the dispatcher returns a fresh ``AttackResult`` copy of the + winning inner result (new ``attack_result_id`` and ``timestamp``) with + the dispatch trail stamped onto ``metadata``. The inner result has + already been persisted by its own post-execute hook, so two rows are + written per successful objective sharing the same ``conversation_id``: + the inner row carries the raw outcome, the outer row carries the + adaptive trail. """ def __init__( self, *, objective_target: PromptTarget, - techniques: dict[str, AttackStrategy[Any, AttackResult]], + techniques: dict[str, TechniqueBundle], selector: AdaptiveTechniqueSelector, + seed_group: SeedAttackGroup, + objective_scorer: TrueFalseScorer | None = None, max_attempts_per_objective: int = 3, ) -> None: """ Args: objective_target (PromptTarget): The target inner attacks run against. Stored for identifier/logging parity; not called directly. - techniques (dict[str, AttackStrategy[Any, AttackResult]]): Mapping from - technique name to a pre-built inner attack. Must be non-empty. + techniques (dict[str, TechniqueBundle]): Mapping from technique name to + its bundle (attack, seed_technique, adversarial_chat). Must be non-empty. selector (AdaptiveTechniqueSelector): Shared selector state. + seed_group (SeedAttackGroup): The seed group bound to this dispatcher. + Each attempt's chosen technique is applied against this group + (merging the technique's ``seed_technique`` when present). + objective_scorer (TrueFalseScorer | None): Scorer passed through to + techniques that generate simulated conversations. max_attempts_per_objective (int): Max attempts per objective; >= 1. Defaults to 3. 
@@ -91,7 +133,13 @@ def __init__( ) self._techniques = techniques self._selector = selector + self._seed_group = seed_group + self._objective_scorer = objective_scorer self._max_attempts = max_attempts_per_objective + # Attempts are inherently sequential (each one reads the selector + # state updated by the previous), so a single shared executor with + # ``max_concurrency=1`` is reused across attempts. + self._executor = AttackExecutor(max_concurrency=1) def _validate_context(self, *, context: AdaptiveDispatchContext) -> None: if not context.objective or context.objective.isspace(): @@ -103,6 +151,51 @@ async def _setup_async(self, *, context: AdaptiveDispatchContext) -> None: async def _teardown_async(self, *, context: AdaptiveDispatchContext) -> None: pass + async def _run_inner_attack_async( + self, + *, + bundle: TechniqueBundle, + attempt_labels: dict[str, str], + ) -> AttackResult: + """ + Execute the chosen technique against this dispatcher's seed group. + + Merges ``bundle.seed_technique`` into the bound ``seed_group`` (when + present) and delegates execution to ``AttackExecutor``. Isolated as a + method so tests can patch the inner-attack call surface. + + Args: + bundle (TechniqueBundle): The chosen technique's attack + seeds + chat. + attempt_labels (dict[str, str]): Memory labels stamped onto this attempt. + + Returns: + AttackResult: The single result produced for this attempt. + + Raises: + RuntimeError: If the executor returned no completed results and no + propagated exception (should be unreachable). 
+ """ + if bundle.seed_technique is not None: + execution_group = self._seed_group.with_technique(technique=bundle.seed_technique) + else: + execution_group = self._seed_group + + executor_result = await self._executor.execute_attack_from_seed_groups_async( + attack=bundle.attack, + seed_groups=[execution_group], + adversarial_chat=bundle.adversarial_chat, + objective_scorer=self._objective_scorer, + memory_labels=attempt_labels, + ) + + if executor_result.completed_results: + return executor_result.completed_results[0] + if executor_result.incomplete_objectives: + raise executor_result.incomplete_objectives[0][1] + raise RuntimeError( # pragma: no cover - defensive + "AttackExecutor returned neither completed nor incomplete results." + ) + async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackResult: adaptive_context = context.memory_labels.get(ADAPTIVE_CONTEXT_LABEL, GLOBAL_CONTEXT) technique_names = list(self._techniques.keys()) @@ -112,7 +205,7 @@ async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackRes for attempt_idx in range(self._max_attempts): chosen = self._selector.select(context=adaptive_context, techniques=technique_names) - inner = self._techniques[chosen] + bundle = self._techniques[chosen] attempt_labels = { **context.memory_labels, ADAPTIVE_TECHNIQUE_LABEL: chosen, @@ -127,10 +220,7 @@ async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackRes chosen, ) - result = await inner.execute_async( - objective=context.objective, - memory_labels=attempt_labels, - ) + result = await self._run_inner_attack_async(bundle=bundle, attempt_labels=attempt_labels) success = result.outcome == AttackOutcome.SUCCESS self._selector.record_outcome(context=adaptive_context, technique=chosen, success=success) diff --git a/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py b/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py index 422e3d431..4be4ffbb6 100644 --- 
a/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py +++ b/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py @@ -7,13 +7,14 @@ import pytest from pyrit.executor.attack.core.attack_parameters import AttackParameters -from pyrit.models import AttackOutcome, AttackResult +from pyrit.models import AttackOutcome, AttackResult, SeedAttackGroup, SeedObjective from pyrit.scenario.scenarios.adaptive.dispatcher import ( ADAPTIVE_ATTEMPT_LABEL, ADAPTIVE_CONTEXT_LABEL, ADAPTIVE_TECHNIQUE_LABEL, AdaptiveDispatchAttack, AdaptiveDispatchContext, + TechniqueBundle, ) from pyrit.scenario.scenarios.adaptive.selector import ( GLOBAL_CONTEXT, @@ -21,25 +22,52 @@ ) -def _make_inner_attack(*, name: str, outcomes: list[AttackOutcome]) -> MagicMock: - """Build a mocked inner attack whose execute_async returns the given outcomes in order.""" - inner = MagicMock(name=name) - results = [ - AttackResult( - conversation_id=f"conv-{name}-{i}", - objective="obj", - outcome=outcome, - ) - for i, outcome in enumerate(outcomes) - ] - inner.execute_async = AsyncMock(side_effect=results) - return inner +def _make_bundle(*, name: str, outcomes: list[AttackOutcome], seed_technique=None) -> TechniqueBundle: + """Build a TechniqueBundle whose attack stub yields the given outcomes in order. + + The dispatcher routes execution through ``_run_inner_attack_async``; tests + patch that method directly so we only need a placeholder attack here. 
+ """ + attack = MagicMock(name=f"attack-{name}") + attack._outcomes = outcomes + attack._name = name + return TechniqueBundle(attack=attack, seed_technique=seed_technique) def _make_context(*, objective: str = "obj", labels: dict[str, str] | None = None) -> AdaptiveDispatchContext: return AdaptiveDispatchContext(params=AttackParameters(objective=objective, memory_labels=labels or {})) +def _patch_inner( + *, + dispatcher: AdaptiveDispatchAttack, + bundles: dict[str, TechniqueBundle], +) -> AsyncMock: + """Replace ``_run_inner_attack_async`` with a stub backed by per-bundle outcomes. + + Returns the AsyncMock so tests can introspect call history (kwargs include + ``bundle`` and ``attempt_labels``). + """ + # Each call consumes one outcome from the chosen bundle's deque. + name_for_attack = {id(b.attack): name for name, b in bundles.items()} + counters: dict[str, int] = dict.fromkeys(bundles, 0) + + async def _stub(*, bundle: TechniqueBundle, attempt_labels: dict[str, str]) -> AttackResult: + name = name_for_attack[id(bundle.attack)] + idx = counters[name] + counters[name] = idx + 1 + outcome = bundle.attack._outcomes[idx] + return AttackResult( + conversation_id=f"conv-{name}-{idx}", + objective="obj", + outcome=outcome, + ) + + inner_mock = AsyncMock(side_effect=_stub) + dispatcher._run_inner_attack_async = inner_mock # type: ignore[method-assign] + return inner_mock + + @pytest.fixture def selector() -> AdaptiveTechniqueSelector: # epsilon=0 makes selection deterministic given the table. 
@@ -51,107 +79,117 @@ def target() -> MagicMock: return MagicMock(name="objective_target") +@pytest.fixture +def seed_group() -> SeedAttackGroup: + return SeedAttackGroup(seeds=[SeedObjective(value="obj")]) + + class TestInit: @pytest.mark.usefixtures("patch_central_database") - def test_init_rejects_empty_techniques(self, target, selector): + def test_init_rejects_empty_techniques(self, target, selector, seed_group): with pytest.raises(ValueError, match="techniques"): - AdaptiveDispatchAttack(objective_target=target, techniques={}, selector=selector) + AdaptiveDispatchAttack( + objective_target=target, + techniques={}, + selector=selector, + seed_group=seed_group, + ) @pytest.mark.parametrize("bad_max", [0, -1]) @pytest.mark.usefixtures("patch_central_database") - def test_init_rejects_invalid_max_attempts(self, target, selector, bad_max): + def test_init_rejects_invalid_max_attempts(self, target, selector, seed_group, bad_max): with pytest.raises(ValueError, match="max_attempts_per_objective"): AdaptiveDispatchAttack( objective_target=target, - techniques={"a": _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS])}, + techniques={"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])}, selector=selector, + seed_group=seed_group, max_attempts_per_objective=bad_max, ) @pytest.mark.usefixtures("patch_central_database") class TestPerform: - async def test_stops_on_first_success(self, target, selector): - a = _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS]) - b = _make_inner_attack(name="b", outcomes=[AttackOutcome.SUCCESS]) + async def test_stops_on_first_success(self, target, selector, seed_group): + bundles = { + "a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS]), + "b": _make_bundle(name="b", outcomes=[AttackOutcome.SUCCESS]), + } dispatcher = AdaptiveDispatchAttack( objective_target=target, - techniques={"a": a, "b": b}, + techniques=bundles, selector=selector, + seed_group=seed_group, max_attempts_per_objective=5, ) + 
inner = _patch_inner(dispatcher=dispatcher, bundles=bundles) result = await dispatcher._perform_async(context=_make_context()) assert result.outcome == AttackOutcome.SUCCESS - total_calls = a.execute_async.call_count + b.execute_async.call_count - assert total_calls == 1 + assert inner.call_count == 1 - async def test_retries_until_max_attempts_on_failure(self, target, selector): - a = _make_inner_attack(name="a", outcomes=[AttackOutcome.FAILURE] * 3) - b = _make_inner_attack(name="b", outcomes=[AttackOutcome.FAILURE] * 3) + async def test_retries_until_max_attempts_on_failure(self, target, selector, seed_group): + bundles = { + "a": _make_bundle(name="a", outcomes=[AttackOutcome.FAILURE] * 3), + "b": _make_bundle(name="b", outcomes=[AttackOutcome.FAILURE] * 3), + } dispatcher = AdaptiveDispatchAttack( objective_target=target, - techniques={"a": a, "b": b}, + techniques=bundles, selector=selector, + seed_group=seed_group, max_attempts_per_objective=3, ) + inner = _patch_inner(dispatcher=dispatcher, bundles=bundles) result = await dispatcher._perform_async(context=_make_context()) assert result.outcome == AttackOutcome.FAILURE - total_calls = a.execute_async.call_count + b.execute_async.call_count - assert total_calls == 3 + assert inner.call_count == 3 - async def test_updates_selector_on_each_attempt(self, target, selector): - a = _make_inner_attack(name="a", outcomes=[AttackOutcome.FAILURE, AttackOutcome.SUCCESS]) - b = _make_inner_attack(name="b", outcomes=[AttackOutcome.SUCCESS]) + async def test_updates_selector_on_each_attempt(self, target, selector, seed_group): + bundles = { + "a": _make_bundle(name="a", outcomes=[AttackOutcome.FAILURE, AttackOutcome.SUCCESS]), + "b": _make_bundle(name="b", outcomes=[AttackOutcome.SUCCESS]), + } dispatcher = AdaptiveDispatchAttack( objective_target=target, - techniques={"a": a, "b": b}, + techniques=bundles, selector=selector, + seed_group=seed_group, max_attempts_per_objective=3, ) + inner = 
_patch_inner(dispatcher=dispatcher, bundles=bundles) await dispatcher._perform_async(context=_make_context()) - # Total attempts across arms must equal sum of selector counts. total_attempts = sum(selector.counts(context=GLOBAL_CONTEXT, technique=t)[1] for t in ("a", "b")) - total_calls = a.execute_async.call_count + b.execute_async.call_count - assert total_attempts == total_calls - - async def test_passes_objective_to_inner(self, target, selector): - a = _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS]) - dispatcher = AdaptiveDispatchAttack( - objective_target=target, - techniques={"a": a}, - selector=selector, - ) + assert total_attempts == inner.call_count - await dispatcher._perform_async(context=_make_context(objective="my-goal")) - - kwargs = a.execute_async.call_args.kwargs - assert kwargs["objective"] == "my-goal" - - async def test_attaches_technique_and_attempt_labels(self, target, selector): - a = _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS]) + async def test_passes_attempt_labels_to_inner(self, target, selector, seed_group): + bundles = {"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])} dispatcher = AdaptiveDispatchAttack( objective_target=target, - techniques={"a": a}, + techniques=bundles, selector=selector, + seed_group=seed_group, ) + inner = _patch_inner(dispatcher=dispatcher, bundles=bundles) await dispatcher._perform_async(context=_make_context(labels={"foo": "bar"})) - labels = a.execute_async.call_args.kwargs["memory_labels"] + labels = inner.call_args.kwargs["attempt_labels"] assert labels["foo"] == "bar" # caller labels preserved assert labels[ADAPTIVE_TECHNIQUE_LABEL] == "a" assert labels[ADAPTIVE_ATTEMPT_LABEL] == "1" - async def test_uses_adaptive_context_from_label(self, target, selector): + async def test_uses_adaptive_context_from_label(self, target, selector, seed_group): # Two techniques; one has been heavily rewarded under context "violence" only. 
- a = _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS]) - b = _make_inner_attack(name="b", outcomes=[AttackOutcome.SUCCESS]) + bundles = { + "a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS]), + "b": _make_bundle(name="b", outcomes=[AttackOutcome.SUCCESS]), + } for _ in range(5): selector.record_outcome(context="violence", technique="b", success=True) for _ in range(5): @@ -159,38 +197,42 @@ async def test_uses_adaptive_context_from_label(self, target, selector): dispatcher = AdaptiveDispatchAttack( objective_target=target, - techniques={"a": a, "b": b}, + techniques=bundles, selector=selector, + seed_group=seed_group, ) + inner = _patch_inner(dispatcher=dispatcher, bundles=bundles) ctx = _make_context(labels={ADAPTIVE_CONTEXT_LABEL: "violence"}) await dispatcher._perform_async(context=ctx) # Exploit should have picked "b" first. - assert b.execute_async.call_count == 1 - assert a.execute_async.call_count == 0 + chosen_bundle = inner.call_args.kwargs["bundle"] + assert chosen_bundle is bundles["b"] - async def test_falls_back_to_global_context_when_label_missing(self, target, selector): - a = _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS]) + async def test_falls_back_to_global_context_when_label_missing(self, target, selector, seed_group): + bundles = {"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])} dispatcher = AdaptiveDispatchAttack( objective_target=target, - techniques={"a": a}, + techniques=bundles, selector=selector, + seed_group=seed_group, ) + _patch_inner(dispatcher=dispatcher, bundles=bundles) await dispatcher._perform_async(context=_make_context(labels={})) # The global context bucket received the update. assert selector.counts(context=GLOBAL_CONTEXT, technique="a") == (1, 1) - async def test_metadata_records_adaptive_trail(self, target, selector): - # Technique "a" fails on the first attempt then succeeds; verify the trail - # captures both attempts in order. 
- a = _make_inner_attack(name="a", outcomes=[AttackOutcome.FAILURE, AttackOutcome.SUCCESS]) + async def test_metadata_records_adaptive_trail(self, target, selector, seed_group): + bundles = {"a": _make_bundle(name="a", outcomes=[AttackOutcome.FAILURE, AttackOutcome.SUCCESS])} dispatcher = AdaptiveDispatchAttack( objective_target=target, - techniques={"a": a}, + techniques=bundles, selector=selector, + seed_group=seed_group, max_attempts_per_objective=3, ) + _patch_inner(dispatcher=dispatcher, bundles=bundles) result = await dispatcher._perform_async(context=_make_context()) trail = result.metadata["adaptive_attempts"] @@ -200,29 +242,32 @@ async def test_metadata_records_adaptive_trail(self, target, selector): ] assert result.metadata["adaptive_context"] == GLOBAL_CONTEXT - async def test_returns_fresh_result_distinct_from_inner(self, target, selector): + async def test_returns_fresh_result_distinct_from_inner(self, target, selector, seed_group): # The dispatcher must NOT return the inner attack's ``AttackResult`` # instance — doing so would cause a duplicate-PK insert when both the # inner and the dispatcher's ``execute_async`` post-execute hooks try # to persist the same row. Verify the returned result has a fresh # ``attack_result_id`` while preserving the inner's identifying fields # and stamping the dispatch trail. - a = _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS]) + bundles = {"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])} dispatcher = AdaptiveDispatchAttack( objective_target=target, - techniques={"a": a}, + techniques=bundles, selector=selector, + seed_group=seed_group, ) - # Capture the inner result's id by spying on execute_async. 
- original_execute = a.execute_async inner_ids: list[str] = [] - async def _spy(**kwargs): - inner_result = await original_execute(**kwargs) + async def _spy(*, bundle, attempt_labels): + inner_result = AttackResult( + conversation_id="conv-a-0", + objective="obj", + outcome=AttackOutcome.SUCCESS, + ) inner_ids.append(inner_result.attack_result_id) return inner_result - a.execute_async = _spy # type: ignore[assignment] + dispatcher._run_inner_attack_async = AsyncMock(side_effect=_spy) # type: ignore[method-assign] result = await dispatcher._perform_async(context=_make_context()) @@ -237,20 +282,22 @@ async def _spy(**kwargs): @pytest.mark.usefixtures("patch_central_database") class TestValidate: @pytest.mark.parametrize("bad_objective", ["", " ", "\n\t"]) - def test_validate_rejects_empty_objective(self, target, selector, bad_objective): + def test_validate_rejects_empty_objective(self, target, selector, seed_group, bad_objective): dispatcher = AdaptiveDispatchAttack( objective_target=target, - techniques={"a": _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS])}, + techniques={"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])}, selector=selector, + seed_group=seed_group, ) with pytest.raises(ValueError, match="objective"): dispatcher._validate_context(context=_make_context(objective=bad_objective)) - def test_validate_accepts_normal_objective(self, target, selector): + def test_validate_accepts_normal_objective(self, target, selector, seed_group): dispatcher = AdaptiveDispatchAttack( objective_target=target, - techniques={"a": _make_inner_attack(name="a", outcomes=[AttackOutcome.SUCCESS])}, + techniques={"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])}, selector=selector, + seed_group=seed_group, ) # Does not raise. 
dispatcher._validate_context(context=_make_context(objective="ok")) diff --git a/tests/unit/scenario/scenarios/adaptive/test_selector.py b/tests/unit/scenario/scenarios/adaptive/test_selector.py index 370430497..2daba3b70 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_selector.py +++ b/tests/unit/scenario/scenarios/adaptive/test_selector.py @@ -168,7 +168,7 @@ def test_global_context_is_constant(self): sg = MagicMock() assert global_context(sg) == GLOBAL_CONTEXT - def test_harm_category_context_uses_sorted_first_category(self): + def test_harm_category_context_joins_sorted_categories(self): sg = MagicMock() sg.harm_categories = ["violence", "hate"] # Multi-category seeds form their own bucket; sorting keeps the key deterministic. diff --git a/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py b/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py index c32cab41d..12b1a45e2 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py +++ b/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py @@ -87,6 +87,21 @@ def _make_seed_group(*, value: str, harm_categories: list[str] | None = None) -> return SeedAttackGroup(seeds=[SeedObjective(value=value, harm_categories=harm_categories)]) +def _make_fake_factory(*, seed_technique=None, adversarial_chat=None) -> MagicMock: + """Return a stub attack-technique factory that produces a fake ``AttackTechnique``. + + Mocks the surface ``AdaptiveScenario._build_techniques_dict`` consumes + (``factory.create(...)`` and ``factory.adversarial_chat``). 
+ """ + fake_technique = MagicMock() + fake_technique.attack = MagicMock(name="fake-attack-strategy") + fake_technique.seed_technique = seed_technique + factory = MagicMock() + factory.create.return_value = fake_technique + factory.adversarial_chat = adversarial_chat + return factory + + FIXTURES = ["patch_central_database", "mock_runtime_env"] @@ -174,21 +189,26 @@ async def test_one_atomic_per_objective(self, mock_objective_target, mock_object # Each atomic carries exactly one seed group. assert len(atomic.objectives) == 1 - async def test_all_atomics_share_one_dispatcher(self, mock_objective_target, mock_objective_scorer): + async def test_atomics_share_one_selector_across_dispatchers(self, mock_objective_target, mock_objective_scorer): groups = { "violence": [ _make_seed_group(value="obj-v1", harm_categories=["violence"]), _make_seed_group(value="obj-v2", harm_categories=["violence"]), ], } - scenario, attacks = await self._build_scenario_and_attacks( + _scenario, attacks = await self._build_scenario_and_attacks( mock_objective_target=mock_objective_target, mock_objective_scorer=mock_objective_scorer, seed_groups=groups, ) - dispatchers = {atomic._attack_technique.attack for atomic in attacks} - assert len(dispatchers) == 1 - assert isinstance(next(iter(dispatchers)), AdaptiveDispatchAttack) + dispatchers = [atomic._attack_technique.attack for atomic in attacks] + # Each objective gets its own dispatcher (bound to its own seed group)... + assert len({id(d) for d in dispatchers}) == len(attacks) + for d in dispatchers: + assert isinstance(d, AdaptiveDispatchAttack) + # ...but they all share the same selector so learning is global. 
+ selectors = {id(d._selector) for d in dispatchers} + assert len(selectors) == 1 async def test_global_context_label_when_using_global_extractor(self, mock_objective_target, mock_objective_scorer): groups = { @@ -257,6 +277,112 @@ async def test_no_usable_techniques_raises(self, mock_objective_target, mock_obj with pytest.raises(ValueError, match="no usable techniques"): await scenario._get_atomic_attacks_async() + async def test_techniques_with_seed_technique_are_kept(self, mock_objective_target, mock_objective_scorer): + """Factories that declare a ``seed_technique`` participate in the pool + (the old behavior silently dropped them with a warning). + """ + groups = {"violence": [_make_seed_group(value="obj")]} + plain_factory = _make_fake_factory(seed_technique=None) + seeded_factory = _make_fake_factory(seed_technique=MagicMock(name="seed_technique")) + + with ( + patch.object(DatasetConfiguration, "get_seed_attack_groups", return_value=groups), + patch.object(SeedAttackGroup, "is_compatible_with_technique", return_value=True), + ): + scenario = TextAdaptive(objective_scorer=mock_objective_scorer) + with patch.object( + scenario, + "_get_attack_technique_factories", + return_value={"prompt_sending": plain_factory, "many_shot": seeded_factory}, + ): + await scenario.initialize_async( + objective_target=mock_objective_target, + include_baseline=False, + ) + attacks = scenario._atomic_attacks + + assert len(attacks) == 1 + dispatcher = attacks[0]._attack_technique.attack + assert isinstance(dispatcher, AdaptiveDispatchAttack) + # Both factories survive; in particular the seeded one is no longer + # silently dropped. 
+ assert "prompt_sending" in dispatcher._techniques + assert "many_shot" in dispatcher._techniques + + async def test_incompatible_seed_technique_is_filtered_per_objective( + self, mock_objective_target, mock_objective_scorer + ): + """Per-objective candidate pool drops techniques whose seed_technique + is incompatible with the seed group; compatible techniques remain. + """ + groups = {"violence": [_make_seed_group(value="obj")]} + plain_factory = _make_fake_factory(seed_technique=None) + incompatible_factory = _make_fake_factory(seed_technique=MagicMock(name="incompatible_seed_technique")) + + with ( + patch.object(DatasetConfiguration, "get_seed_attack_groups", return_value=groups), + patch.object(SeedAttackGroup, "is_compatible_with_technique", return_value=False), + ): + scenario = TextAdaptive(objective_scorer=mock_objective_scorer) + with patch.object( + scenario, + "_get_attack_technique_factories", + return_value={"prompt_sending": plain_factory, "many_shot": incompatible_factory}, + ): + await scenario.initialize_async( + objective_target=mock_objective_target, + include_baseline=False, + ) + attacks = scenario._atomic_attacks + + assert len(attacks) == 1 + dispatcher = attacks[0]._attack_technique.attack + # Only the plain technique survives; the seed_technique-bearing one is filtered out + # because is_compatible_with_technique returned False. + assert "prompt_sending" in dispatcher._techniques + assert "many_shot" not in dispatcher._techniques + + async def test_objective_skipped_when_no_compatible_techniques( + self, mock_objective_target, mock_objective_scorer, caplog + ): + """When every technique requires an incompatible seed_technique, the + objective is dropped with a warning rather than producing an atomic + attack with an empty technique pool. 
+ """ + groups = { + "violence": [_make_seed_group(value="obj-keep")], + "hate": [_make_seed_group(value="obj-skip")], + } + seeded_factory = _make_fake_factory(seed_technique=MagicMock(name="seed_technique")) + + # is_compatible_with_technique returns True for "obj-keep", False for "obj-skip". + def _selective_compat(self_group, *, technique): + return self_group.objective.value == "obj-keep" + + with ( + patch.object(DatasetConfiguration, "get_seed_attack_groups", return_value=groups), + patch.object(SeedAttackGroup, "is_compatible_with_technique", _selective_compat), + ): + scenario = TextAdaptive(objective_scorer=mock_objective_scorer) + with patch.object( + scenario, + "_get_attack_technique_factories", + return_value={"prompt_sending": seeded_factory}, + ): + import logging + + with caplog.at_level(logging.WARNING): + await scenario.initialize_async( + objective_target=mock_objective_target, + include_baseline=False, + ) + attacks = scenario._atomic_attacks + + # Only the compatible objective produced an atomic attack. + assert len(attacks) == 1 + # Skip was logged with the affected objective value. + assert any("obj-skip" in record.getMessage() for record in caplog.records) + @pytest.mark.usefixtures(*FIXTURES) class TestTextAdaptiveSelectorRehydration: @@ -306,11 +432,10 @@ def test_replays_attempts_from_metadata(self, mock_objective_scorer): ), ], } - scenario._memory = MagicMock() - scenario._memory.get_scenario_results.return_value = [prior_result] selector = AdaptiveTechniqueSelector() - scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a", "b"}) + with patch.object(scenario._memory, "get_scenario_results", return_value=[prior_result]): + scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a", "b"}) # Trails replayed verbatim into the per-context table. 
assert selector.counts(context="violence", technique="a") == (0, 1) @@ -338,11 +463,10 @@ def test_skips_unknown_techniques(self, mock_objective_scorer): ), ], } - scenario._memory = MagicMock() - scenario._memory.get_scenario_results.return_value = [prior_result] selector = AdaptiveTechniqueSelector() - scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a"}) + with patch.object(scenario._memory, "get_scenario_results", return_value=[prior_result]): + scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a"}) # Only the known technique was recorded. assert selector.counts(context="ctx", technique="a") == (0, 1) @@ -357,23 +481,21 @@ def test_ignores_results_without_adaptive_metadata(self, mock_objective_scorer): prior_result.attack_results = { "baseline": [AttackResult(conversation_id="c", objective="o", metadata={})], } - scenario._memory = MagicMock() - scenario._memory.get_scenario_results.return_value = [prior_result] selector = AdaptiveTechniqueSelector() - scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a"}) + with patch.object(scenario._memory, "get_scenario_results", return_value=[prior_result]): + scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a"}) assert selector.snapshot() == {} def test_memory_load_failure_is_swallowed(self, mock_objective_scorer): from pyrit.scenario.scenarios.adaptive.selector import AdaptiveTechniqueSelector scenario = TextAdaptive(objective_scorer=mock_objective_scorer, scenario_result_id="rid") - scenario._memory = MagicMock() - scenario._memory.get_scenario_results.side_effect = RuntimeError("db down") selector = AdaptiveTechniqueSelector() - # Must not raise; selector remains empty. - scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a"}) + with patch.object(scenario._memory, "get_scenario_results", side_effect=RuntimeError("db down")): + # Must not raise; selector remains empty. 
+ scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a"}) assert selector.snapshot() == {} From 61a1b7d4b8bce980f2c21562809eed6cc9447d07 Mon Sep 17 00:00:00 2001 From: hannahwestra25 Date: Tue, 19 May 2026 16:02:14 -0400 Subject: [PATCH 08/12] clean up and fix docstrings --- doc/code/scenarios/3_adaptive_scenarios.ipynb | 59 ++++++++++++------- doc/code/scenarios/3_adaptive_scenarios.py | 34 +++++++---- .../scenarios/adaptive/adaptive_scenario.py | 17 ++++++ .../scenario/scenarios/adaptive/dispatcher.py | 26 +++++++- pyrit/scenario/scenarios/adaptive/selector.py | 12 +++- .../scenarios/adaptive/text_adaptive.py | 4 ++ 6 files changed, 113 insertions(+), 39 deletions(-) diff --git a/doc/code/scenarios/3_adaptive_scenarios.ipynb b/doc/code/scenarios/3_adaptive_scenarios.ipynb index 2067a8896..bd1ddd664 100644 --- a/doc/code/scenarios/3_adaptive_scenarios.ipynb +++ b/doc/code/scenarios/3_adaptive_scenarios.ipynb @@ -16,8 +16,8 @@ "\n", "For each objective, the scenario tries up to `max_attempts_per_objective` techniques:\n", "\n", - "- With probability `epsilon`, it **explores** — picks a random technique.\n", - "- Otherwise it **exploits** — picks the technique with the highest observed success\n", + "- With probability `epsilon`, it **explores** \u2014 picks a random technique.\n", + "- Otherwise it **exploits** \u2014 picks the technique with the highest observed success\n", " rate so far.\n", "- It records the outcome and stops early on success.\n", "\n", @@ -29,8 +29,8 @@ "| Feature | Static scenarios | Adaptive scenarios |\n", "|---------------------|-----------------------------------|------------------------------------|\n", "| Technique selection | Run every selected technique | Pick per-objective from outcomes |\n", - "| Early stopping | No | Yes — stops on first success |\n", - "| Cost | O(techniques × objectives) | O(max_attempts × objectives) |\n", + "| Early stopping | No | Yes \u2014 stops on first success |\n", + "| Cost | 
O(techniques \u00d7 objectives) | O(max_attempts \u00d7 objectives) |\n", "\n", "`AdaptiveScenario` is the modality-agnostic base class.\n", "[`TextAdaptive`](../../../pyrit/scenario/scenarios/adaptive/text_adaptive.py) is the\n", @@ -89,7 +89,7 @@ " objective_target=objective_target,\n", ")\n", "result = await scenario.run_async() # type: ignore\n", - "await printer.print_summary_async(result) # type: ignore" + "await printer.write_async(result) # type: ignore" ] }, { @@ -99,9 +99,9 @@ "source": [ "## Tuning exploration (`epsilon`)\n", "\n", - "- `epsilon=0.0` — pure exploitation (always pick the best-known technique).\n", - "- `epsilon=1.0` — pure exploration (random every time).\n", - "- `epsilon=0.2` (default) — 20% exploration." + "- `epsilon=0.0` \u2014 pure exploitation (always pick the best-known technique).\n", + "- `epsilon=1.0` \u2014 pure exploration (random every time).\n", + "- `epsilon=0.2` (default) \u2014 20% exploration." ] }, { @@ -118,7 +118,7 @@ " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=4),\n", ")\n", "explorative_result = await explorative_scenario.run_async() # type: ignore\n", - "await printer.print_summary_async(explorative_result) # type: ignore" + "await printer.write_async(explorative_result) # type: ignore" ] }, { @@ -146,7 +146,7 @@ " dataset_config=DatasetConfiguration(dataset_names=[\"airt_violence\"], max_dataset_size=4),\n", ")\n", "persistent_result = await persistent_scenario.run_async() # type: ignore\n", - "await printer.print_summary_async(persistent_result) # type: ignore" + "await printer.write_async(persistent_result) # type: ignore" ] }, { @@ -156,7 +156,7 @@ "source": [ "## Learning per harm category\n", "\n", - "By default, the scenario keeps one global success-rate table — what works on hate\n", + "By default, the scenario keeps one global success-rate table \u2014 what works on hate\n", "objectives boosts the same technique on violence objectives. 
Pass `harm_category_context`\n", "to learn each category independently:" ] @@ -178,7 +178,7 @@ " ),\n", ")\n", "contextual_result = await contextual_scenario.run_async() # type: ignore\n", - "await printer.print_summary_async(contextual_result) # type: ignore" + "await printer.write_async(contextual_result) # type: ignore" ] }, { @@ -208,7 +208,7 @@ " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=4),\n", ")\n", "single_turn_result = await single_turn_scenario.run_async() # type: ignore\n", - "await printer.print_summary_async(single_turn_result) # type: ignore" + "await printer.write_async(single_turn_result) # type: ignore" ] }, { @@ -235,7 +235,7 @@ " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=2),\n", ")\n", "deterministic_result = await deterministic_scenario.run_async() # type: ignore\n", - "await printer.print_summary_async(deterministic_result) # type: ignore" + "await printer.write_async(deterministic_result) # type: ignore" ] }, { @@ -245,15 +245,30 @@ "source": [ "## Resuming a run\n", "\n", - "Adaptive scenarios are resumable — pass `scenario_result_id=...` to the `TextAdaptive`\n", + "Adaptive scenarios are resumable \u2014 pass `scenario_result_id=...` to the `TextAdaptive`\n", "constructor and the run picks up where it left off, with prior outcomes replayed into\n", - "the selector.\n", + "the selector. Here we resume the deterministic run from the previous cell." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14", + "metadata": {}, + "outputs": [], + "source": [ + "resumed_scenario = TextAdaptive(\n", + " seed=42,\n", + " epsilon=0.3,\n", + " scenario_result_id=str(deterministic_result.id),\n", + ")\n", "\n", - "```python\n", - "resumed_scenario = TextAdaptive(scenario_result_id=\"\")\n", - "await resumed_scenario.initialize_async(objective_target=objective_target)\n", - "resumed_result = await resumed_scenario.run_async()\n", - "```" + "await resumed_scenario.initialize_async( # type: ignore\n", + " objective_target=objective_target,\n", + " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=2),\n", + ")\n", + "resumed_result = await resumed_scenario.run_async() # type: ignore\n", + "await printer.write_async(resumed_result) # type: ignore" ] } ], @@ -264,4 +279,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/doc/code/scenarios/3_adaptive_scenarios.py b/doc/code/scenarios/3_adaptive_scenarios.py index 038561903..1e09d235b 100644 --- a/doc/code/scenarios/3_adaptive_scenarios.py +++ b/doc/code/scenarios/3_adaptive_scenarios.py @@ -69,7 +69,7 @@ objective_target=objective_target, ) result = await scenario.run_async() # type: ignore -await printer.print_summary_async(result) # type: ignore +await printer.write_async(result) # type: ignore # %% [markdown] # ## Tuning exploration (`epsilon`) @@ -86,7 +86,7 @@ dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=4), ) explorative_result = await explorative_scenario.run_async() # type: ignore -await printer.print_summary_async(explorative_result) # type: ignore +await printer.write_async(explorative_result) # type: ignore # %% [markdown] # ## Attempts per objective @@ -102,7 +102,7 @@ dataset_config=DatasetConfiguration(dataset_names=["airt_violence"], max_dataset_size=4), ) persistent_result = await persistent_scenario.run_async() # type: ignore -await 
printer.print_summary_async(persistent_result) # type: ignore +await printer.write_async(persistent_result) # type: ignore # %% [markdown] # ## Learning per harm category @@ -122,7 +122,7 @@ ), ) contextual_result = await contextual_scenario.run_async() # type: ignore -await printer.print_summary_async(contextual_result) # type: ignore +await printer.write_async(contextual_result) # type: ignore # %% [markdown] # ## Restricting which techniques participate @@ -140,7 +140,7 @@ dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=4), ) single_turn_result = await single_turn_scenario.run_async() # type: ignore -await printer.print_summary_async(single_turn_result) # type: ignore +await printer.write_async(single_turn_result) # type: ignore # %% [markdown] # ## Reproducible runs @@ -155,17 +155,25 @@ dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=2), ) deterministic_result = await deterministic_scenario.run_async() # type: ignore -await printer.print_summary_async(deterministic_result) # type: ignore +await printer.write_async(deterministic_result) # type: ignore # %% [markdown] # ## Resuming a run # # Adaptive scenarios are resumable — pass `scenario_result_id=...` to the `TextAdaptive` # constructor and the run picks up where it left off, with prior outcomes replayed into -# the selector. -# -# ```python -# resumed_scenario = TextAdaptive(scenario_result_id="") -# await resumed_scenario.initialize_async(objective_target=objective_target) -# resumed_result = await resumed_scenario.run_async() -# ``` +# the selector. Here we resume the deterministic run from the previous cell. 
+ +# %% +resumed_scenario = TextAdaptive( + seed=42, + epsilon=0.3, + scenario_result_id=str(deterministic_result.id), +) + +await resumed_scenario.initialize_async( # type: ignore + objective_target=objective_target, + dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=2), +) +resumed_result = await resumed_scenario.run_async() # type: ignore +await printer.write_async(resumed_result) # type: ignore diff --git a/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py b/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py index 6edf753e2..776c46ea5 100644 --- a/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py +++ b/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py @@ -113,6 +113,15 @@ async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: so learning accumulates across objectives. Per-objective, techniques whose ``seed_technique`` is incompatible with the seed group are filtered out; objectives left with no compatible techniques are skipped. + + Returns: + list[AtomicAttack]: One ``AtomicAttack`` per objective with at + least one compatible technique. Empty if every seed group + is incompatible with every selected technique. + + Raises: + ValueError: If ``self._objective_target`` is not set, or if + ``_build_techniques_dict`` finds no usable techniques. """ if self._objective_target is None: raise ValueError("objective_target must be set before creating attacks") @@ -154,6 +163,10 @@ def _build_techniques_dict( ``seed_technique`` and ``adversarial_chat`` so the dispatcher can reproduce the static ``AtomicAttack`` execution path per attempt. + Returns: + dict[str, TechniqueBundle]: Mapping from technique name to its + bundle, in the order selected strategies were resolved. + Raises: ValueError: If no techniques remain after filtering. Includes the requested techniques and skip reasons. 
@@ -204,6 +217,10 @@ def _build_atomic_for_seed_group( any) is compatible with this seed group, then constructs a dedicated ``AdaptiveDispatchAttack`` bound to this seed group. Returns ``None`` when no techniques are compatible (caller skips the objective). + + Raises: + ValueError: If ``self._objective_target`` is not set (defensive + guard; ``_get_atomic_attacks_async`` enforces this earlier). """ if self._objective_target is None: # pragma: no cover - defensive raise ValueError("objective_target must be set before creating attacks") diff --git a/pyrit/scenario/scenarios/adaptive/dispatcher.py b/pyrit/scenario/scenarios/adaptive/dispatcher.py index 8499102ca..2ff245171 100644 --- a/pyrit/scenario/scenarios/adaptive/dispatcher.py +++ b/pyrit/scenario/scenarios/adaptive/dispatcher.py @@ -142,14 +142,15 @@ def __init__( self._executor = AttackExecutor(max_concurrency=1) def _validate_context(self, *, context: AdaptiveDispatchContext) -> None: + """Ensure the context carries a non-empty objective string.""" if not context.objective or context.objective.isspace(): raise ValueError("Attack objective must be provided and non-empty") async def _setup_async(self, *, context: AdaptiveDispatchContext) -> None: - pass + """No-op: per-attempt setup is owned by the inner technique's executor.""" async def _teardown_async(self, *, context: AdaptiveDispatchContext) -> None: - pass + """No-op: per-attempt teardown is owned by the inner technique's executor.""" async def _run_inner_attack_async( self, @@ -197,6 +198,27 @@ async def _run_inner_attack_async( ) async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackResult: + """ + Run the per-objective adaptive loop. + + Resolves the per-objective context key from ``context.memory_labels`` + (falling back to :data:`GLOBAL_CONTEXT`), then loops up to + ``max_attempts_per_objective`` times: select a technique, execute it, + record the outcome, and stop early on success. 
+ + Args: + context (AdaptiveDispatchContext): Execution context. ``memory_labels`` + may carry :data:`ADAPTIVE_CONTEXT_LABEL` to scope the selector. + + Returns: + AttackResult: A fresh dispatcher-owned copy of the final inner + result with the dispatch trail stamped onto ``metadata`` + (see class docstring for the two-row persistence note). + + Raises: + RuntimeError: If the loop somehow ran zero attempts (unreachable + because ``max_attempts_per_objective`` is validated >= 1). + """ adaptive_context = context.memory_labels.get(ADAPTIVE_CONTEXT_LABEL, GLOBAL_CONTEXT) technique_names = list(self._techniques.keys()) diff --git a/pyrit/scenario/scenarios/adaptive/selector.py b/pyrit/scenario/scenarios/adaptive/selector.py index 967aee9d1..cd7853301 100644 --- a/pyrit/scenario/scenarios/adaptive/selector.py +++ b/pyrit/scenario/scenarios/adaptive/selector.py @@ -24,7 +24,12 @@ def global_context(_seed_attack_group: SeedAttackGroup) -> str: - """Return a single shared context for all objectives.""" + """ + Return a single shared context for all objectives. + + Returns: + str: Always :data:`GLOBAL_CONTEXT`. + """ return GLOBAL_CONTEXT @@ -33,7 +38,10 @@ def harm_category_context(seed_attack_group: SeedAttackGroup) -> str: Return a context keyed by the sorted, ``|``-joined harm categories. Multi-category seeds form their own bucket; sorting makes the key deterministic. - Returns ``UNCATEGORIZED_CONTEXT`` when no categories are set. + + Returns: + str: The ``|``-joined sorted harm categories, or :data:`UNCATEGORIZED_CONTEXT` + when the seed group has no categories. 
""" categories = seed_attack_group.harm_categories if not categories: diff --git a/pyrit/scenario/scenarios/adaptive/text_adaptive.py b/pyrit/scenario/scenarios/adaptive/text_adaptive.py index bc08edd9c..1d8706fa5 100644 --- a/pyrit/scenario/scenarios/adaptive/text_adaptive.py +++ b/pyrit/scenario/scenarios/adaptive/text_adaptive.py @@ -63,17 +63,20 @@ class TextAdaptive(AdaptiveScenario): @classmethod def get_strategy_class(cls) -> type[ScenarioStrategy]: + """Return the strategy enum for this scenario, building it once on first access.""" if cls._cached_strategy_class is None: cls._cached_strategy_class = _build_text_adaptive_strategy() return cls._cached_strategy_class @classmethod def get_default_strategy(cls) -> ScenarioStrategy: + """Return the default strategy aggregate (resolves to every ``default``-tagged technique).""" strategy_class = cls.get_strategy_class() return strategy_class("default") @classmethod def required_datasets(cls) -> list[str]: + """Return the dataset names this scenario expects when no override is provided.""" return [ "airt_hate", "airt_fairness", @@ -86,6 +89,7 @@ def required_datasets(cls) -> list[str]: @classmethod def default_dataset_config(cls) -> DatasetConfiguration: + """Return the default :class:`DatasetConfiguration` (required datasets, capped at 4 per dataset).""" return DatasetConfiguration(dataset_names=cls.required_datasets(), max_dataset_size=4) @apply_defaults From 32d8b5e52d60fac83ead553088ba67c138e68148 Mon Sep 17 00:00:00 2001 From: hannahwestra25 Date: Tue, 19 May 2026 16:57:00 -0400 Subject: [PATCH 09/12] simplify notebook and pre-commit --- doc/code/scenarios/3_adaptive_scenarios.ipynb | 202 +++++++----------- doc/code/scenarios/3_adaptive_scenarios.py | 146 ++++++------- .../scenarios/adaptive/adaptive_scenario.py | 18 +- .../scenario/scenarios/adaptive/dispatcher.py | 7 +- pyrit/scenario/scenarios/adaptive/selector.py | 11 +- .../scenarios/adaptive/text_adaptive.py | 15 +- 6 files changed, 185 insertions(+), 214 
deletions(-) diff --git a/doc/code/scenarios/3_adaptive_scenarios.ipynb b/doc/code/scenarios/3_adaptive_scenarios.ipynb index bd1ddd664..7be2b738e 100644 --- a/doc/code/scenarios/3_adaptive_scenarios.ipynb +++ b/doc/code/scenarios/3_adaptive_scenarios.ipynb @@ -16,8 +16,8 @@ "\n", "For each objective, the scenario tries up to `max_attempts_per_objective` techniques:\n", "\n", - "- With probability `epsilon`, it **explores** \u2014 picks a random technique.\n", - "- Otherwise it **exploits** \u2014 picks the technique with the highest observed success\n", + "- With probability `epsilon`, it **explores** — picks a random technique.\n", + "- Otherwise it **exploits** — picks the technique with the highest observed success\n", " rate so far.\n", "- It records the outcome and stops early on success.\n", "\n", @@ -29,8 +29,8 @@ "| Feature | Static scenarios | Adaptive scenarios |\n", "|---------------------|-----------------------------------|------------------------------------|\n", "| Technique selection | Run every selected technique | Pick per-objective from outcomes |\n", - "| Early stopping | No | Yes \u2014 stops on first success |\n", - "| Cost | O(techniques \u00d7 objectives) | O(max_attempts \u00d7 objectives) |\n", + "| Early stopping | No | Yes — stops on first success |\n", + "| Cost | O(techniques × objectives) | O(max_attempts × objectives) |\n", "\n", "`AdaptiveScenario` is the modality-agnostic base class.\n", "[`TextAdaptive`](../../../pyrit/scenario/scenarios/adaptive/text_adaptive.py) is the\n", @@ -97,178 +97,138 @@ "id": "5", "metadata": {}, "source": [ - "## Tuning exploration (`epsilon`)\n", + "## Configuring a run\n", "\n", - "- `epsilon=0.0` \u2014 pure exploitation (always pick the best-known technique).\n", - "- `epsilon=1.0` \u2014 pure exploration (random every time).\n", - "- `epsilon=0.2` (default) \u2014 20% exploration." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6", - "metadata": {}, - "outputs": [], - "source": [ - "explorative_scenario = TextAdaptive(epsilon=0.5)\n", + "All the knobs below are constructor or `initialize_async` arguments — combine whichever\n", + "you need on a single scenario instance:\n", "\n", - "await explorative_scenario.initialize_async( # type: ignore\n", - " objective_target=objective_target,\n", - " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=4),\n", - ")\n", - "explorative_result = await explorative_scenario.run_async() # type: ignore\n", - "await printer.write_async(explorative_result) # type: ignore" - ] - }, - { - "cell_type": "markdown", - "id": "7", - "metadata": {}, - "source": [ - "## Attempts per objective\n", + "- **`epsilon`** — exploration probability. `0.0` is pure exploit, `1.0` is pure random,\n", + " `0.2` (default) is 20% exploration.\n", + "- **`max_attempts_per_objective`** — caps techniques tried per objective. Higher means\n", + " more chances to succeed and more API calls.\n", + "- **`context_extractor`** — partitions the success-rate table. The default\n", + " `global_context` keeps one shared table; `harm_category_context` learns each harm\n", + " category independently. Custom callables of type `Callable[[SeedAttackGroup], str]`\n", + " are supported.\n", + "- **`seed`** — makes every selection decision deterministic.\n", + "- **`scenario_strategies`** (on `initialize_async`) — restricts which techniques the\n", + " selector can pick from. Use `TextAdaptive.get_strategy_class()` to access the enum.\n", "\n", - "`max_attempts_per_objective` caps how many techniques are tried per objective before\n", - "moving on. Higher = more chances to succeed, more API calls." + "The cell below exercises all of them at once." 
] }, { "cell_type": "code", "execution_count": null, - "id": "8", + "id": "6", "metadata": {}, "outputs": [], "source": [ - "persistent_scenario = TextAdaptive(max_attempts_per_objective=5)\n", + "strategy_class = TextAdaptive.get_strategy_class()\n", "\n", - "await persistent_scenario.initialize_async( # type: ignore\n", - " objective_target=objective_target,\n", - " dataset_config=DatasetConfiguration(dataset_names=[\"airt_violence\"], max_dataset_size=4),\n", + "configured_scenario = TextAdaptive(\n", + " epsilon=0.3,\n", + " max_attempts_per_objective=5,\n", + " context_extractor=harm_category_context,\n", + " seed=42,\n", ")\n", - "persistent_result = await persistent_scenario.run_async() # type: ignore\n", - "await printer.write_async(persistent_result) # type: ignore" - ] - }, - { - "cell_type": "markdown", - "id": "9", - "metadata": {}, - "source": [ - "## Learning per harm category\n", - "\n", - "By default, the scenario keeps one global success-rate table \u2014 what works on hate\n", - "objectives boosts the same technique on violence objectives. 
Pass `harm_category_context`\n", - "to learn each category independently:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10", - "metadata": {}, - "outputs": [], - "source": [ - "contextual_scenario = TextAdaptive(context_extractor=harm_category_context)\n", "\n", - "await contextual_scenario.initialize_async( # type: ignore\n", + "await configured_scenario.initialize_async( # type: ignore\n", " objective_target=objective_target,\n", + " scenario_strategies=[strategy_class(\"single_turn\")],\n", " dataset_config=DatasetConfiguration(\n", " dataset_names=[\"airt_hate\", \"airt_violence\"],\n", " max_dataset_size=4,\n", " ),\n", ")\n", - "contextual_result = await contextual_scenario.run_async() # type: ignore\n", - "await printer.write_async(contextual_result) # type: ignore" + "configured_result = await configured_scenario.run_async() # type: ignore\n", + "await printer.write_async(configured_result) # type: ignore" ] }, { "cell_type": "markdown", - "id": "11", + "id": "7", "metadata": {}, "source": [ - "## Restricting which techniques participate\n", + "## Resuming a run\n", "\n", - "Use `scenario_strategies` to limit which techniques the scenario can pick from." + "Adaptive scenarios are resumable — pass `scenario_result_id=...` to the `TextAdaptive`\n", + "constructor and the run picks up where it left off, with prior outcomes replayed into\n", + "the selector. Resume must use the same configuration as the original run." 
] }, { "cell_type": "code", "execution_count": null, - "id": "12", + "id": "8", "metadata": {}, "outputs": [], "source": [ - "strategy_class = TextAdaptive.get_strategy_class()\n", - "\n", - "single_turn_scenario = TextAdaptive()\n", + "resumed_scenario = TextAdaptive(\n", + " epsilon=0.3,\n", + " max_attempts_per_objective=5,\n", + " context_extractor=harm_category_context,\n", + " seed=42,\n", + " scenario_result_id=str(configured_result.id),\n", + ")\n", "\n", - "await single_turn_scenario.initialize_async( # type: ignore\n", + "await resumed_scenario.initialize_async( # type: ignore\n", " objective_target=objective_target,\n", " scenario_strategies=[strategy_class(\"single_turn\")],\n", - " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=4),\n", + " dataset_config=DatasetConfiguration(\n", + " dataset_names=[\"airt_hate\", \"airt_violence\"],\n", + " max_dataset_size=4,\n", + " ),\n", ")\n", - "single_turn_result = await single_turn_scenario.run_async() # type: ignore\n", - "await printer.write_async(single_turn_result) # type: ignore" + "resumed_result = await resumed_scenario.run_async() # type: ignore\n", + "await printer.write_async(resumed_result) # type: ignore" ] }, { "cell_type": "markdown", - "id": "13", + "id": "9", "metadata": {}, "source": [ - "## Reproducible runs\n", + "## Inspecting which techniques were tried\n", "\n", - "Pass `seed` to make every selection decision deterministic." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14", - "metadata": {}, - "outputs": [], - "source": [ - "deterministic_scenario = TextAdaptive(seed=42, epsilon=0.3)\n", + "The dispatcher stamps every objective's `AttackResult.metadata` with:\n", "\n", - "await deterministic_scenario.initialize_async( # type: ignore\n", - " objective_target=objective_target,\n", - " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=2),\n", - ")\n", - "deterministic_result = await deterministic_scenario.run_async() # type: ignore\n", - "await printer.write_async(deterministic_result) # type: ignore" - ] - }, - { - "cell_type": "markdown", - "id": "15", - "metadata": {}, - "source": [ - "## Resuming a run\n", + "- `adaptive_context` — the bucket key from the `context_extractor`.\n", + "- `adaptive_attempts` — the ordered list of `{\"technique\", \"outcome\"}` dicts\n", + " recording exactly which techniques the selector picked and what happened.\n", "\n", - "Adaptive scenarios are resumable \u2014 pass `scenario_result_id=...` to the `TextAdaptive`\n", - "constructor and the run picks up where it left off, with prior outcomes replayed into\n", - "the selector. Here we resume the deterministic run from the previous cell." + "Walk that metadata to see the per-objective trail and aggregate counts." 
] }, { "cell_type": "code", "execution_count": null, - "id": "14", + "id": "10", "metadata": {}, "outputs": [], "source": [ - "resumed_scenario = TextAdaptive(\n", - " seed=42,\n", - " epsilon=0.3,\n", - " scenario_result_id=str(deterministic_result.id),\n", - ")\n", - "\n", - "await resumed_scenario.initialize_async( # type: ignore\n", - " objective_target=objective_target,\n", - " dataset_config=DatasetConfiguration(dataset_names=[\"airt_hate\"], max_dataset_size=2),\n", - ")\n", - "resumed_result = await resumed_scenario.run_async() # type: ignore\n", - "await printer.write_async(resumed_result) # type: ignore" + "from collections import Counter\n", + "\n", + "# Per-objective trail\n", + "for results in resumed_result.attack_results.values():\n", + " for r in results:\n", + " attempts = r.metadata.get(\"adaptive_attempts\", [])\n", + " trail = \" → \".join(f\"{a['technique']}({a['outcome']})\" for a in attempts)\n", + " print(f\"[{r.outcome.value:7s}] {r.objective!r}: {trail}\")\n", + "\n", + "# Aggregate per-technique pick counts and success rate across the run\n", + "picks: Counter[str] = Counter()\n", + "wins: Counter[str] = Counter()\n", + "for results in resumed_result.attack_results.values():\n", + " for r in results:\n", + " for step in r.metadata.get(\"adaptive_attempts\", []):\n", + " picks[step[\"technique\"]] += 1\n", + " if step[\"outcome\"] == \"success\":\n", + " wins[step[\"technique\"]] += 1\n", + "\n", + "print(\"\\nTechnique wins / picks rate\")\n", + "for technique, n in picks.most_common():\n", + " print(f\"{technique:20s} {wins[technique]:>4} / {n:<4} {wins[technique] / n:.0%}\")" ] } ], @@ -279,4 +239,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/doc/code/scenarios/3_adaptive_scenarios.py b/doc/code/scenarios/3_adaptive_scenarios.py index 1e09d235b..96e3320bb 100644 --- a/doc/code/scenarios/3_adaptive_scenarios.py +++ b/doc/code/scenarios/3_adaptive_scenarios.py @@ -72,108 +72,104 @@ await 
printer.write_async(result) # type: ignore # %% [markdown] -# ## Tuning exploration (`epsilon`) +# ## Configuring a run # -# - `epsilon=0.0` — pure exploitation (always pick the best-known technique). -# - `epsilon=1.0` — pure exploration (random every time). -# - `epsilon=0.2` (default) — 20% exploration. - -# %% -explorative_scenario = TextAdaptive(epsilon=0.5) - -await explorative_scenario.initialize_async( # type: ignore - objective_target=objective_target, - dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=4), -) -explorative_result = await explorative_scenario.run_async() # type: ignore -await printer.write_async(explorative_result) # type: ignore - -# %% [markdown] -# ## Attempts per objective +# All the knobs below are constructor or `initialize_async` arguments — combine whichever +# you need on a single scenario instance: # -# `max_attempts_per_objective` caps how many techniques are tried per objective before -# moving on. Higher = more chances to succeed, more API calls. +# - **`epsilon`** — exploration probability. `0.0` is pure exploit, `1.0` is pure random, +# `0.2` (default) is 20% exploration. +# - **`max_attempts_per_objective`** — caps techniques tried per objective. Higher means +# more chances to succeed and more API calls. +# - **`context_extractor`** — partitions the success-rate table. The default +# `global_context` keeps one shared table; `harm_category_context` learns each harm +# category independently. Custom callables of type `Callable[[SeedAttackGroup], str]` +# are supported. +# - **`seed`** — makes every selection decision deterministic. +# - **`scenario_strategies`** (on `initialize_async`) — restricts which techniques the +# selector can pick from. Use `TextAdaptive.get_strategy_class()` to access the enum. +# +# The cell below exercises all of them at once. 
# %% -persistent_scenario = TextAdaptive(max_attempts_per_objective=5) +strategy_class = TextAdaptive.get_strategy_class() -await persistent_scenario.initialize_async( # type: ignore - objective_target=objective_target, - dataset_config=DatasetConfiguration(dataset_names=["airt_violence"], max_dataset_size=4), +configured_scenario = TextAdaptive( + epsilon=0.3, + max_attempts_per_objective=5, + context_extractor=harm_category_context, + seed=42, ) -persistent_result = await persistent_scenario.run_async() # type: ignore -await printer.write_async(persistent_result) # type: ignore - -# %% [markdown] -# ## Learning per harm category -# -# By default, the scenario keeps one global success-rate table — what works on hate -# objectives boosts the same technique on violence objectives. Pass `harm_category_context` -# to learn each category independently: - -# %% -contextual_scenario = TextAdaptive(context_extractor=harm_category_context) -await contextual_scenario.initialize_async( # type: ignore +await configured_scenario.initialize_async( # type: ignore objective_target=objective_target, + scenario_strategies=[strategy_class("single_turn")], dataset_config=DatasetConfiguration( dataset_names=["airt_hate", "airt_violence"], max_dataset_size=4, ), ) -contextual_result = await contextual_scenario.run_async() # type: ignore -await printer.write_async(contextual_result) # type: ignore - -# %% [markdown] -# ## Restricting which techniques participate -# -# Use `scenario_strategies` to limit which techniques the scenario can pick from. 
- -# %% -strategy_class = TextAdaptive.get_strategy_class() - -single_turn_scenario = TextAdaptive() - -await single_turn_scenario.initialize_async( # type: ignore - objective_target=objective_target, - scenario_strategies=[strategy_class("single_turn")], - dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=4), -) -single_turn_result = await single_turn_scenario.run_async() # type: ignore -await printer.write_async(single_turn_result) # type: ignore - -# %% [markdown] -# ## Reproducible runs -# -# Pass `seed` to make every selection decision deterministic. - -# %% -deterministic_scenario = TextAdaptive(seed=42, epsilon=0.3) - -await deterministic_scenario.initialize_async( # type: ignore - objective_target=objective_target, - dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=2), -) -deterministic_result = await deterministic_scenario.run_async() # type: ignore -await printer.write_async(deterministic_result) # type: ignore +configured_result = await configured_scenario.run_async() # type: ignore +await printer.write_async(configured_result) # type: ignore # %% [markdown] # ## Resuming a run # # Adaptive scenarios are resumable — pass `scenario_result_id=...` to the `TextAdaptive` # constructor and the run picks up where it left off, with prior outcomes replayed into -# the selector. Here we resume the deterministic run from the previous cell. +# the selector. Resume must use the same configuration as the original run. 
# %% resumed_scenario = TextAdaptive( - seed=42, epsilon=0.3, - scenario_result_id=str(deterministic_result.id), + max_attempts_per_objective=5, + context_extractor=harm_category_context, + seed=42, + scenario_result_id=str(configured_result.id), ) await resumed_scenario.initialize_async( # type: ignore objective_target=objective_target, - dataset_config=DatasetConfiguration(dataset_names=["airt_hate"], max_dataset_size=2), + scenario_strategies=[strategy_class("single_turn")], + dataset_config=DatasetConfiguration( + dataset_names=["airt_hate", "airt_violence"], + max_dataset_size=4, + ), ) resumed_result = await resumed_scenario.run_async() # type: ignore await printer.write_async(resumed_result) # type: ignore + +# %% [markdown] +# ## Inspecting which techniques were tried +# +# The dispatcher stamps every objective's `AttackResult.metadata` with: +# +# - `adaptive_context` — the bucket key from the `context_extractor`. +# - `adaptive_attempts` — the ordered list of `{"technique", "outcome"}` dicts +# recording exactly which techniques the selector picked and what happened. +# +# Walk that metadata to see the per-objective trail and aggregate counts. 
+ +# %% +from collections import Counter + +# Per-objective trail +for results in resumed_result.attack_results.values(): + for r in results: + attempts = r.metadata.get("adaptive_attempts", []) + trail = " → ".join(f"{a['technique']}({a['outcome']})" for a in attempts) + print(f"[{r.outcome.value:7s}] {r.objective!r}: {trail}") + +# Aggregate per-technique pick counts and success rate across the run +picks: Counter[str] = Counter() +wins: Counter[str] = Counter() +for results in resumed_result.attack_results.values(): + for r in results: + for step in r.metadata.get("adaptive_attempts", []): + picks[step["technique"]] += 1 + if step["outcome"] == "success": + wins[step["technique"]] += 1 + +print("\nTechnique wins / picks rate") +for technique, n in picks.most_common(): + print(f"{technique:20s} {wins[technique]:>4} / {n:<4} {wins[technique] / n:.0%}") diff --git a/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py b/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py index 776c46ea5..723849ce9 100644 --- a/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py +++ b/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py @@ -215,8 +215,11 @@ def _build_atomic_for_seed_group( Filters the technique pool down to those whose ``seed_technique`` (if any) is compatible with this seed group, then constructs a dedicated - ``AdaptiveDispatchAttack`` bound to this seed group. Returns ``None`` - when no techniques are compatible (caller skips the objective). + ``AdaptiveDispatchAttack`` bound to this seed group. + + Returns: + AtomicAttack | None: The constructed atomic attack, or ``None`` when + no techniques are compatible (caller skips the objective). 
Raises: ValueError: If ``self._objective_target`` is not set (defensive @@ -225,12 +228,11 @@ def _build_atomic_for_seed_group( if self._objective_target is None: # pragma: no cover - defensive raise ValueError("objective_target must be set before creating attacks") - compatible: dict[str, TechniqueBundle] = {} - for name, bundle in techniques.items(): - if bundle.seed_technique is None or seed_group.is_compatible_with_technique( - technique=bundle.seed_technique - ): - compatible[name] = bundle + compatible: dict[str, TechniqueBundle] = { + name: bundle + for name, bundle in techniques.items() + if bundle.seed_technique is None or seed_group.is_compatible_with_technique(technique=bundle.seed_technique) + } if not compatible: logger.warning( diff --git a/pyrit/scenario/scenarios/adaptive/dispatcher.py b/pyrit/scenario/scenarios/adaptive/dispatcher.py index 2ff245171..46808bfde 100644 --- a/pyrit/scenario/scenarios/adaptive/dispatcher.py +++ b/pyrit/scenario/scenarios/adaptive/dispatcher.py @@ -142,7 +142,12 @@ def __init__( self._executor = AttackExecutor(max_concurrency=1) def _validate_context(self, *, context: AdaptiveDispatchContext) -> None: - """Ensure the context carries a non-empty objective string.""" + """ + Ensure the context carries a non-empty objective string. + + Raises: + ValueError: If ``context.objective`` is empty or whitespace-only. 
+ """ if not context.objective or context.objective.isspace(): raise ValueError("Attack objective must be provided and non-empty") diff --git a/pyrit/scenario/scenarios/adaptive/selector.py b/pyrit/scenario/scenarios/adaptive/selector.py index cd7853301..d2d9e63a7 100644 --- a/pyrit/scenario/scenarios/adaptive/selector.py +++ b/pyrit/scenario/scenarios/adaptive/selector.py @@ -13,14 +13,12 @@ if TYPE_CHECKING: from pyrit.models.seeds.seed_attack_group import SeedAttackGroup - -ContextExtractor = Callable[["SeedAttackGroup"], str] """Maps a ``SeedAttackGroup`` to an adaptive context key.""" - -GLOBAL_CONTEXT: str = "_global" +ContextExtractor = Callable[["SeedAttackGroup"], str] """Default context: all objectives share one selection table.""" -UNCATEGORIZED_CONTEXT: str = "_uncategorized" +GLOBAL_CONTEXT: str = "_global" """Fallback context for seed groups with no harm category metadata.""" +UNCATEGORIZED_CONTEXT: str = "_uncategorized" def global_context(_seed_attack_group: SeedAttackGroup) -> str: @@ -173,6 +171,9 @@ def _estimate(self, *, context: str, technique: str) -> float: ``pool_threshold`` local attempts. Callers must already hold ``self._lock``. + + Returns: + float: Laplace-smoothed success-rate estimate in ``(0, 1)``. 
""" local_s, local_n = self._counts.get((context, technique), (0, 0)) if local_n >= self._pool_threshold: diff --git a/pyrit/scenario/scenarios/adaptive/text_adaptive.py b/pyrit/scenario/scenarios/adaptive/text_adaptive.py index 1d8706fa5..4bbbe7ff4 100644 --- a/pyrit/scenario/scenarios/adaptive/text_adaptive.py +++ b/pyrit/scenario/scenarios/adaptive/text_adaptive.py @@ -14,24 +14,31 @@ from __future__ import annotations import logging -from typing import ClassVar +from typing import TYPE_CHECKING, ClassVar from pyrit.common import apply_defaults from pyrit.registry.tag_query import TagQuery from pyrit.scenario.core.dataset_configuration import DatasetConfiguration -from pyrit.scenario.core.scenario_strategy import ScenarioStrategy from pyrit.scenario.scenarios.adaptive.adaptive_scenario import AdaptiveScenario from pyrit.scenario.scenarios.adaptive.selector import ( ContextExtractor, global_context, ) -from pyrit.score import TrueFalseScorer + +if TYPE_CHECKING: + from pyrit.scenario.core.scenario_strategy import ScenarioStrategy + from pyrit.score import TrueFalseScorer logger = logging.getLogger(__name__) def _build_text_adaptive_strategy() -> type[ScenarioStrategy]: - """Build the strategy enum from the core scenario-techniques catalog.""" + """ + Build the strategy enum from the core scenario-techniques catalog. + + Returns: + type[ScenarioStrategy]: The dynamically-built strategy enum class. 
+ """ from pyrit.registry.object_registries.attack_technique_registry import ( AttackTechniqueRegistry, ) From f86c191163d417c9cf3d2536b2e087af297e06e2 Mon Sep 17 00:00:00 2001 From: hannahwestra25 Date: Thu, 21 May 2026 16:40:03 -0400 Subject: [PATCH 10/12] feat: address PR #1760 review feedback - Remove prompt_sending from adaptive pool; enable baseline comparison - Expose max_attempts_per_objective via supported_parameters() (scam.py pattern) - Rename AdaptiveTechniqueSelector -> EpsilonGreedyTechniqueSelector - Extract TechniqueSelector Protocol; accept custom selector via kwarg - Per-decision RNG derivation (SHA-256) for resume reproducibility - Drop uuid.uuid4() fallback for objective IDs - Per-dataset atomic attacks (one AtomicAttack per dataset, not per objective) - AdaptiveDispatchParams with per-call seed_group and compatibility filtering - Context extraction moved to dispatcher - Rehydration uses get_attack_results with attribution_data filtering - Split selector.py into selectors/ folder (protocol.py + epsilon_greedy.py) - Update notebooks for new API patterns Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- doc/code/scenarios/3_adaptive_scenarios.ipynb | 31 ++- doc/code/scenarios/3_adaptive_scenarios.py | 31 ++- pyrit/scenario/scenarios/adaptive/__init__.py | 10 +- .../scenarios/adaptive/adaptive_scenario.py | 214 ++++++++------- .../scenario/scenarios/adaptive/dispatcher.py | 184 +++++++++---- .../scenarios/adaptive/selectors/__init__.py | 26 ++ .../epsilon_greedy.py} | 85 +++--- .../scenarios/adaptive/selectors/protocol.py | 66 +++++ .../scenarios/adaptive/text_adaptive.py | 62 +++-- .../scenarios/adaptive/test_dispatcher.py | 60 ++--- ...est_selector.py => test_epsilon_greedy.py} | 63 ++--- .../scenarios/adaptive/test_protocol.py | 46 ++++ .../scenarios/adaptive/test_text_adaptive.py | 247 ++++++++++-------- 13 files changed, 675 insertions(+), 450 deletions(-) create mode 100644 
pyrit/scenario/scenarios/adaptive/selectors/__init__.py rename pyrit/scenario/scenarios/adaptive/{selector.py => selectors/epsilon_greedy.py} (71%) create mode 100644 pyrit/scenario/scenarios/adaptive/selectors/protocol.py rename tests/unit/scenario/scenarios/adaptive/{test_selector.py => test_epsilon_greedy.py} (81%) create mode 100644 tests/unit/scenario/scenarios/adaptive/test_protocol.py diff --git a/doc/code/scenarios/3_adaptive_scenarios.ipynb b/doc/code/scenarios/3_adaptive_scenarios.ipynb index 7be2b738e..88996496e 100644 --- a/doc/code/scenarios/3_adaptive_scenarios.ipynb +++ b/doc/code/scenarios/3_adaptive_scenarios.ipynb @@ -73,7 +73,8 @@ "source": [ "## Basic usage\n", "\n", - "Defaults: `epsilon=0.2`, `max_attempts_per_objective=3`, the subclass's default datasets." + "Defaults: `max_attempts_per_objective=3`, epsilon-greedy selector with `epsilon=0.2`,\n", + "the subclass’s default datasets." ] }, { @@ -99,18 +100,16 @@ "source": [ "## Configuring a run\n", "\n", - "All the knobs below are constructor or `initialize_async` arguments — combine whichever\n", - "you need on a single scenario instance:\n", - "\n", - "- **`epsilon`** — exploration probability. `0.0` is pure exploit, `1.0` is pure random,\n", - " `0.2` (default) is 20% exploration.\n", "- **`max_attempts_per_objective`** — caps techniques tried per objective. Higher means\n", - " more chances to succeed and more API calls.\n", + " more chances to succeed and more API calls. Set via `set_params_from_args`.\n", + "- **`selector`** — a pre-built `TechniqueSelector` instance. Pass an\n", + " `EpsilonGreedyTechniqueSelector(epsilon=..., pool_threshold=..., random_seed=...)`\n", + " to tune the selection algorithm. Defaults to `EpsilonGreedyTechniqueSelector()`\n", + " (`epsilon=0.2`, `pool_threshold=3`).\n", "- **`context_extractor`** — partitions the success-rate table. 
The default\n", " `global_context` keeps one shared table; `harm_category_context` learns each harm\n", " category independently. Custom callables of type `Callable[[SeedAttackGroup], str]`\n", " are supported.\n", - "- **`seed`** — makes every selection decision deterministic.\n", "- **`scenario_strategies`** (on `initialize_async`) — restricts which techniques the\n", " selector can pick from. Use `TextAdaptive.get_strategy_class()` to access the enum.\n", "\n", @@ -124,13 +123,16 @@ "metadata": {}, "outputs": [], "source": [ + "from pyrit.scenario.scenarios.adaptive import EpsilonGreedyTechniqueSelector\n", + "\n", "strategy_class = TextAdaptive.get_strategy_class()\n", "\n", "configured_scenario = TextAdaptive(\n", - " epsilon=0.3,\n", - " max_attempts_per_objective=5,\n", " context_extractor=harm_category_context,\n", - " seed=42,\n", + " selector=EpsilonGreedyTechniqueSelector(epsilon=0.3, random_seed=42),\n", + ")\n", + "configured_scenario.set_params_from_args(\n", + " args={\"max_attempts_per_objective\": 5}\n", ")\n", "\n", "await configured_scenario.initialize_async( # type: ignore\n", @@ -165,12 +167,13 @@ "outputs": [], "source": [ "resumed_scenario = TextAdaptive(\n", - " epsilon=0.3,\n", - " max_attempts_per_objective=5,\n", " context_extractor=harm_category_context,\n", - " seed=42,\n", + " selector=EpsilonGreedyTechniqueSelector(epsilon=0.3, random_seed=42),\n", " scenario_result_id=str(configured_result.id),\n", ")\n", + "resumed_scenario.set_params_from_args(\n", + " args={\"max_attempts_per_objective\": 5}\n", + ")\n", "\n", "await resumed_scenario.initialize_async( # type: ignore\n", " objective_target=objective_target,\n", diff --git a/doc/code/scenarios/3_adaptive_scenarios.py b/doc/code/scenarios/3_adaptive_scenarios.py index 96e3320bb..a0d38ec30 100644 --- a/doc/code/scenarios/3_adaptive_scenarios.py +++ b/doc/code/scenarios/3_adaptive_scenarios.py @@ -60,7 +60,8 @@ # %% [markdown] # ## Basic usage # -# Defaults: `epsilon=0.2`, 
`max_attempts_per_objective=3`, the subclass's default datasets. +# Defaults: `max_attempts_per_objective=3`, epsilon-greedy selector with `epsilon=0.2`, +# the subclass's default datasets. # %% scenario = TextAdaptive() @@ -74,31 +75,32 @@ # %% [markdown] # ## Configuring a run # -# All the knobs below are constructor or `initialize_async` arguments — combine whichever -# you need on a single scenario instance: -# -# - **`epsilon`** — exploration probability. `0.0` is pure exploit, `1.0` is pure random, -# `0.2` (default) is 20% exploration. # - **`max_attempts_per_objective`** — caps techniques tried per objective. Higher means -# more chances to succeed and more API calls. +# more chances to succeed and more API calls. Set via `set_params_from_args`. +# - **`selector`** — a pre-built ``TechniqueSelector`` instance. Pass an +# ``EpsilonGreedyTechniqueSelector(epsilon=..., pool_threshold=..., random_seed=...)`` +# to tune the selection algorithm. Defaults to ``EpsilonGreedyTechniqueSelector()`` +# (``epsilon=0.2``, ``pool_threshold=3``). # - **`context_extractor`** — partitions the success-rate table. The default # `global_context` keeps one shared table; `harm_category_context` learns each harm # category independently. Custom callables of type `Callable[[SeedAttackGroup], str]` # are supported. -# - **`seed`** — makes every selection decision deterministic. # - **`scenario_strategies`** (on `initialize_async`) — restricts which techniques the # selector can pick from. Use `TextAdaptive.get_strategy_class()` to access the enum. # # The cell below exercises all of them at once. 
# %% +from pyrit.scenario.scenarios.adaptive import EpsilonGreedyTechniqueSelector + strategy_class = TextAdaptive.get_strategy_class() configured_scenario = TextAdaptive( - epsilon=0.3, - max_attempts_per_objective=5, context_extractor=harm_category_context, - seed=42, + selector=EpsilonGreedyTechniqueSelector(epsilon=0.3, random_seed=42), +) +configured_scenario.set_params_from_args( + args={"max_attempts_per_objective": 5} ) await configured_scenario.initialize_async( # type: ignore @@ -121,12 +123,13 @@ # %% resumed_scenario = TextAdaptive( - epsilon=0.3, - max_attempts_per_objective=5, context_extractor=harm_category_context, - seed=42, + selector=EpsilonGreedyTechniqueSelector(epsilon=0.3, random_seed=42), scenario_result_id=str(configured_result.id), ) +resumed_scenario.set_params_from_args( + args={"max_attempts_per_objective": 5} +) await resumed_scenario.initialize_async( # type: ignore objective_target=objective_target, diff --git a/pyrit/scenario/scenarios/adaptive/__init__.py b/pyrit/scenario/scenarios/adaptive/__init__.py index d0bd978c2..6ba741563 100644 --- a/pyrit/scenario/scenarios/adaptive/__init__.py +++ b/pyrit/scenario/scenarios/adaptive/__init__.py @@ -7,10 +7,12 @@ from pyrit.scenario.scenarios.adaptive.dispatcher import ( ADAPTIVE_CONTEXT_LABEL, AdaptiveDispatchAttack, + AdaptiveDispatchParams, ) -from pyrit.scenario.scenarios.adaptive.selector import ( - AdaptiveTechniqueSelector, +from pyrit.scenario.scenarios.adaptive.selectors import ( ContextExtractor, + EpsilonGreedyTechniqueSelector, + TechniqueSelector, global_context, harm_category_context, ) @@ -19,9 +21,11 @@ __all__ = [ "ADAPTIVE_CONTEXT_LABEL", "AdaptiveDispatchAttack", + "AdaptiveDispatchParams", "AdaptiveScenario", - "AdaptiveTechniqueSelector", "ContextExtractor", + "EpsilonGreedyTechniqueSelector", + "TechniqueSelector", "TextAdaptive", "global_context", "harm_category_context", diff --git a/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py 
b/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py index 723849ce9..862565d8e 100644 --- a/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py +++ b/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py @@ -3,36 +3,34 @@ """ ``AdaptiveScenario`` — modality-agnostic base for scenarios that pick attack -techniques per-objective using an ``AdaptiveTechniqueSelector``. +techniques per-objective using a ``TechniqueSelector``. -Owns selector wiring, dispatcher construction, per-objective atomic-attack +Owns selector wiring, dispatcher construction, per-dataset atomic-attack emission, and resume rehydration. Concrete subclasses (``TextAdaptive``, future ``ImageAdaptive`` / ``AudioAdaptive``) only declare strategy class, default datasets, version, and atomic-attack prefix. -Baseline policy is ``Forbidden``: ``prompt_sending`` participates as one of -the selector's techniques rather than being prepended. +Baseline policy is ``Enabled``: prompt_sending runs as a separate baseline +comparison and is excluded from the adaptive technique pool. """ from __future__ import annotations import logging -import random -import uuid from typing import TYPE_CHECKING, ClassVar from pyrit.executor.attack import AttackScoringConfig from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique -from pyrit.scenario.core.scenario import BaselinePolicy, Scenario +from pyrit.scenario.core.scenario import BaselineAttackPolicy, Scenario from pyrit.scenario.scenarios.adaptive.dispatcher import ( - ADAPTIVE_CONTEXT_LABEL, AdaptiveDispatchAttack, TechniqueBundle, ) -from pyrit.scenario.scenarios.adaptive.selector import ( - AdaptiveTechniqueSelector, +from pyrit.scenario.scenarios.adaptive.selectors import ( + EpsilonGreedyTechniqueSelector, ContextExtractor, + TechniqueSelector, global_context, ) @@ -54,7 +52,7 @@ class AdaptiveScenario(Scenario): rehydration are handled here. 
""" - BASELINE_POLICY: ClassVar[BaselinePolicy] = BaselinePolicy.Forbidden + BASELINE_ATTACK_POLICY: ClassVar[BaselineAttackPolicy] = BaselineAttackPolicy.Enabled #: Subclasses must declare a scenario version for memory bookkeeping. VERSION: ClassVar[int] @@ -66,36 +64,27 @@ def __init__( self, *, objective_scorer: TrueFalseScorer | None = None, - epsilon: float = 0.2, - pool_threshold: int = 3, - max_attempts_per_objective: int = 3, - seed: int | None = None, context_extractor: ContextExtractor = global_context, + selector: TechniqueSelector | None = None, scenario_result_id: str | None = None, ) -> None: """ Args: objective_scorer (TrueFalseScorer | None): Scorer used to judge each response. Defaults to the composite scorer from the base class. - epsilon (float): Exploration probability for the selector. Defaults to 0.2. - pool_threshold (int): Minimum per-(context, technique) attempts before - the local estimate overrides the pooled rate. Set to 1 to disable - pooling. Defaults to 3. - max_attempts_per_objective (int): Max techniques per objective. Defaults to 3. - seed (int | None): RNG seed for deterministic selection. Defaults to ``None``. context_extractor (ContextExtractor): Maps a ``SeedAttackGroup`` to a context key. Defaults to ``global_context``. + selector (TechniqueSelector | None): Pre-built selector. When ``None`` + (default) an :class:`EpsilonGreedyTechniqueSelector` is created + with default settings. scenario_result_id (str | None): ID of an existing ``ScenarioResult`` to resume. 
""" if not objective_scorer: objective_scorer = self._get_default_objective_scorer() self._objective_scorer: TrueFalseScorer = objective_scorer - self._epsilon = epsilon - self._pool_threshold = pool_threshold - self._max_attempts_per_objective = max_attempts_per_objective - self._seed = seed self._context_extractor = context_extractor + self._custom_selector = selector super().__init__( version=self.VERSION, @@ -106,18 +95,24 @@ def __init__( async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: """ - Build one ``AtomicAttack`` per objective. + Build one ``AtomicAttack`` per dataset, each carrying every objective + in that dataset as a separate ``SeedAttackGroup``. - Each objective gets a freshly constructed ``AdaptiveDispatchAttack`` - bound to its seed group, but all dispatchers share the same selector - so learning accumulates across objectives. Per-objective, techniques - whose ``seed_technique`` is incompatible with the seed group are - filtered out; objectives left with no compatible techniques are skipped. + A single ``AdaptiveDispatchAttack`` is constructed per dataset and + shared across its seed groups; per-call seed-group routing and + per-call ``seed_technique`` compatibility filtering happen inside the + dispatcher (driven by ``AdaptiveDispatchParams.seed_group``). All + dispatchers across all datasets share one ``TechniqueSelector`` + instance so learning accumulates globally. + + Seed groups whose objective is incompatible with every technique are + dropped up-front with a warning so the dispatcher never sees an empty + compatible pool at run time. Returns: - list[AtomicAttack]: One ``AtomicAttack`` per objective with at - least one compatible technique. Empty if every seed group - is incompatible with every selected technique. + list[AtomicAttack]: One ``AtomicAttack`` per dataset that has at + least one compatible seed group. Empty if every seed group is + incompatible with every selected technique. 
Raises: ValueError: If ``self._objective_target`` is not set, or if @@ -128,26 +123,25 @@ async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: techniques = self._build_techniques_dict(objective_target=self._objective_target) - selector = AdaptiveTechniqueSelector( - epsilon=self._epsilon, - pool_threshold=self._pool_threshold, - rng=random.Random(self._seed), - ) + selector: TechniqueSelector + if self._custom_selector is not None: + selector = self._custom_selector + else: + selector = EpsilonGreedyTechniqueSelector() # On resume, replay prior attempt outcomes from persisted metadata. self._rehydrate_selector_from_memory(selector=selector, known_techniques=set(techniques)) seed_groups_by_dataset = self._dataset_config.get_seed_attack_groups() atomic_attacks: list[AtomicAttack] = [] for dataset_name, seed_groups in seed_groups_by_dataset.items(): - for seed_group in seed_groups: - atomic = self._build_atomic_for_seed_group( - dataset_name=dataset_name, - seed_group=seed_group, - techniques=techniques, - selector=selector, - ) - if atomic is not None: - atomic_attacks.append(atomic) + atomic = self._build_atomic_for_dataset( + dataset_name=dataset_name, + seed_groups=seed_groups, + techniques=techniques, + selector=selector, + ) + if atomic is not None: + atomic_attacks.append(atomic) return atomic_attacks @@ -202,24 +196,25 @@ def _build_techniques_dict( return techniques - def _build_atomic_for_seed_group( + def _build_atomic_for_dataset( self, *, dataset_name: str, - seed_group: SeedAttackGroup, + seed_groups: list[SeedAttackGroup], techniques: dict[str, TechniqueBundle], - selector: AdaptiveTechniqueSelector, + selector: TechniqueSelector, ) -> AtomicAttack | None: """ - Build a single ``AtomicAttack`` for one ``SeedAttackGroup``. + Build a single ``AtomicAttack`` for one dataset with all compatible + seed groups attached. 
- Filters the technique pool down to those whose ``seed_technique`` (if - any) is compatible with this seed group, then constructs a dedicated - ``AdaptiveDispatchAttack`` bound to this seed group. + Seed groups for which no technique in the pool is compatible are + dropped here with a warning so the dispatcher's per-call compatible + pool is guaranteed non-empty. Returns: AtomicAttack | None: The constructed atomic attack, or ``None`` when - no techniques are compatible (caller skips the objective). + every seed group is incompatible with every technique. Raises: ValueError: If ``self._objective_target`` is not set (defensive @@ -228,64 +223,62 @@ def _build_atomic_for_seed_group( if self._objective_target is None: # pragma: no cover - defensive raise ValueError("objective_target must be set before creating attacks") - compatible: dict[str, TechniqueBundle] = { - name: bundle - for name, bundle in techniques.items() - if bundle.seed_technique is None or seed_group.is_compatible_with_technique(technique=bundle.seed_technique) - } - - if not compatible: - logger.warning( - "AdaptiveScenario: no compatible techniques for seed group in dataset '%s' (objective=%r); skipping.", - dataset_name, - seed_group.objective.value, + compatible_seed_groups: list[SeedAttackGroup] = [] + for seed_group in seed_groups: + has_compatible = any( + bundle.seed_technique is None + or seed_group.is_compatible_with_technique(technique=bundle.seed_technique) + for bundle in techniques.values() ) - return None + if has_compatible: + compatible_seed_groups.append(seed_group) + else: + logger.warning( + "AdaptiveScenario: no compatible techniques for seed group in dataset '%s' " + "(objective=%r); skipping.", + dataset_name, + seed_group.objective.value, + ) - adaptive_context = self._context_extractor(seed_group) - # Prefer the objective's id when available so resume keys stay stable - # across re-fetches of the same seed groups. 
- objective_id = seed_group.objective.id if seed_group.objective.id else uuid.uuid4() - atomic_attack_name = f"{self._atomic_attack_prefix}_{dataset_name}_{objective_id}" + if not compatible_seed_groups: + return None dispatcher = AdaptiveDispatchAttack( objective_target=self._objective_target, - techniques=compatible, + techniques=techniques, selector=selector, - seed_group=seed_group, + context_extractor=self._context_extractor, objective_scorer=self._objective_scorer, - max_attempts_per_objective=self._max_attempts_per_objective, + max_attempts_per_objective=self.params["max_attempts_per_objective"], ) - memory_labels = { - **self._memory_labels, - ADAPTIVE_CONTEXT_LABEL: adaptive_context, - } return AtomicAttack( - atomic_attack_name=atomic_attack_name, + atomic_attack_name=f"{self._atomic_attack_prefix}_{dataset_name}", attack_technique=AttackTechnique(attack=dispatcher), - seed_groups=[seed_group], + seed_groups=compatible_seed_groups, objective_scorer=self._objective_scorer, - memory_labels=memory_labels, + memory_labels=dict(self._memory_labels), display_group=dataset_name, ) def _rehydrate_selector_from_memory( self, *, - selector: AdaptiveTechniqueSelector, + selector: TechniqueSelector, known_techniques: set[str], ) -> None: """ Replay persisted dispatch trails into ``selector`` so resume preserves learned state. - Iterates every persisted ``AttackResult`` on the resumed - ``ScenarioResult`` and calls ``record_outcome`` once per attempt in - each ``metadata["adaptive_attempts"]`` trail. + Queries ``AttackResultEntry`` rows directly by ``scenario_result_id`` + (which selects on ``attribution_parent_id`` stamped at write time by + ``AtomicAttack``'s attribution path) and filters to rows belonging to + this scenario's adaptive atomic attacks via + ``attribution_data["parent_collection"]``. Args: - selector (AdaptiveTechniqueSelector): A freshly built selector to populate. + selector (TechniqueSelector): A freshly built selector to populate. 
known_techniques (set[str]): Techniques available in the current run. Trails referencing unknown techniques (e.g. after a strategies change) are skipped so replay can't poison the table. @@ -296,32 +289,35 @@ def _rehydrate_selector_from_memory( # Narrow to errors a memory backend would plausibly raise (DB/IO # failures, integrity issues). Programmer-level errors propagate. try: - scenario_results = self._memory.get_scenario_results(scenario_result_ids=[self._scenario_result_id]) + rows = self._memory.get_attack_results(scenario_result_id=self._scenario_result_id) except (RuntimeError, OSError, ValueError) as exc: - logger.warning(f"AdaptiveScenario: failed to load prior scenario result for rehydration: {exc}") - return - - if not scenario_results: + logger.warning(f"AdaptiveScenario: failed to load prior attack results for rehydration: {exc}") return + adaptive_prefix = f"{self._atomic_attack_prefix}_" replayed = 0 - for results_list in scenario_results[0].attack_results.values(): - for result in results_list: - trail = result.metadata.get("adaptive_attempts") if result.metadata else None - context = result.metadata.get("adaptive_context") if result.metadata else None - if not trail or not context: + for result in rows: + if result.attribution_data is None: + continue + collection = result.attribution_data.get("parent_collection") + if not collection or not collection.startswith(adaptive_prefix): + continue + metadata = result.metadata or {} + trail = metadata.get("adaptive_attempts") + context = metadata.get("adaptive_context") + if not trail or not context: + continue + for step in trail: + technique = step.get("technique") + outcome = step.get("outcome") + if not technique or technique not in known_techniques: continue - for step in trail: - technique = step.get("technique") - outcome = step.get("outcome") - if not technique or technique not in known_techniques: - continue - selector.record_outcome( - context=context, - technique=technique, - success=outcome == 
"success", - ) - replayed += 1 + selector.record_outcome( + context=context, + technique=technique, + success=outcome == "success", + ) + replayed += 1 if replayed: logger.info(f"AdaptiveScenario: rehydrated selector with {replayed} prior attempt(s).") diff --git a/pyrit/scenario/scenarios/adaptive/dispatcher.py b/pyrit/scenario/scenarios/adaptive/dispatcher.py index 46808bfde..a8e671cdd 100644 --- a/pyrit/scenario/scenarios/adaptive/dispatcher.py +++ b/pyrit/scenario/scenarios/adaptive/dispatcher.py @@ -2,35 +2,40 @@ # Licensed under the MIT license. """ -``AdaptiveDispatchAttack`` — picks an inner technique per attempt via an -``AdaptiveTechniqueSelector``, runs it, records the outcome, and loops up to -``max_attempts_per_objective`` times. Reads the per-objective context key from -``context.memory_labels[ADAPTIVE_CONTEXT_LABEL]`` (falls back to the global context). - -The dispatcher is bound to a single ``SeedAttackGroup`` at construction time so -it can merge each chosen technique's ``seed_technique`` (when present) into the -seed group before delegating execution to ``AttackExecutor``. +``AdaptiveDispatchAttack`` — picks an inner technique per attempt via a +``TechniqueSelector``, runs it, records the outcome, and loops up to +``max_attempts_per_objective`` times. + +The dispatcher is shared across all seed groups in an enclosing +``AtomicAttack`` and reads the per-call ``SeedAttackGroup`` from +``AdaptiveDispatchParams.seed_group`` (populated by +``AdaptiveDispatchParams.from_seed_group_async``). It computes the per-call +adaptive context key via the injected ``ContextExtractor`` and merges each +chosen technique's ``seed_technique`` (when present) into the seed group +before delegating execution to ``AttackExecutor``. 
""" from __future__ import annotations +import dataclasses import logging import uuid -from dataclasses import dataclass, replace +from dataclasses import dataclass, field, replace from datetime import datetime, timezone -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Optional from pyrit.executor.attack.core.attack_executor import AttackExecutor from pyrit.executor.attack.core.attack_parameters import AttackParameters from pyrit.executor.attack.core.attack_strategy import AttackContext, AttackStrategy -from pyrit.models import AttackOutcome, AttackResult -from pyrit.scenario.scenarios.adaptive.selector import ( - GLOBAL_CONTEXT, - AdaptiveTechniqueSelector, +from pyrit.models import AttackOutcome, AttackResult, SeedAttackGroup +from pyrit.scenario.scenarios.adaptive.selectors import ( + ContextExtractor, + TechniqueSelector, + global_context, ) if TYPE_CHECKING: - from pyrit.models import SeedAttackGroup, SeedAttackTechniqueGroup + from pyrit.models import SeedAttackTechniqueGroup from pyrit.prompt_target import PromptTarget from pyrit.score import TrueFalseScorer @@ -38,8 +43,8 @@ # Memory-label keys stamped onto persisted prompt rows so adaptive attempts -# can be filtered/grouped after a run. The scenario stamps the context once -# per objective; the dispatcher stamps technique + attempt index on each try. +# can be filtered/grouped after a run. The dispatcher stamps all three on +# each attempt (context derived per-call from the seed group). ADAPTIVE_CONTEXT_LABEL: str = "_adaptive_context" """Per-objective context key (e.g. 
``"_global"`` or a harm category).""" ADAPTIVE_TECHNIQUE_LABEL: str = "_adaptive_technique" @@ -63,25 +68,75 @@ class TechniqueBundle: adversarial_chat: PromptTarget | None = None +@dataclass(frozen=True) +class AdaptiveDispatchParams(AttackParameters): + # The original SeedAttackGroup is preserved on the params so the + # dispatcher can apply per-attempt seed_technique merging and derive + # the per-call adaptive context. Captured by ``from_seed_group_async``; + # not user-supplied via overrides. + seed_group: Optional[SeedAttackGroup] = field(default=None, repr=False, compare=False) + + @classmethod + async def from_seed_group_async( + cls, + *, + seed_group: SeedAttackGroup, + adversarial_chat: Optional["PromptTarget"] = None, # noqa: ARG003 — required by base class signature + objective_scorer: Optional["TrueFalseScorer"] = None, # noqa: ARG003 — required by base class signature + **overrides: Any, + ) -> "AdaptiveDispatchParams": + """ + Build params for a single dispatch and capture the original seed_group. + + The dispatcher applies seed_technique merging itself per-attempt, so + we deliberately bypass the base class's simulated-conversation + expansion / next_message extraction: the inner technique runs through + its own ``execute_attack_from_seed_groups_async`` call which performs + that work using the technique-merged seed_group. + """ + if seed_group.objective is None: + raise ValueError("seed_group.objective is not initialized") + seed_group.validate() + + valid_fields = {f.name for f in dataclasses.fields(cls)} - {"seed_group"} + invalid = set(overrides.keys()) - valid_fields + if invalid: + raise ValueError( + f"{cls.__name__} does not accept parameters: {invalid}. 
Accepted: {valid_fields}" + ) + + return cls( + objective=seed_group.objective.value, + memory_labels=overrides.get("memory_labels") or {}, + seed_group=seed_group, + ) + + @dataclass -class AdaptiveDispatchContext(AttackContext[AttackParameters]): +class AdaptiveDispatchContext(AttackContext[AdaptiveDispatchParams]): """Execution context for ``AdaptiveDispatchAttack`` (no extra state).""" class AdaptiveDispatchAttack(AttackStrategy[AdaptiveDispatchContext, AttackResult]): """ Attack that delegates each attempt to one of several inner techniques, - choosing per attempt via an ``AdaptiveTechniqueSelector``. + choosing per attempt via a ``TechniqueSelector``. For each objective, loops up to ``max_attempts_per_objective`` times: - ask the selector, execute the chosen technique, record the outcome, and - stop early on success. The selector is shared by reference with the - scenario so learning accumulates across objectives. - - The dispatcher is bound to a single ``SeedAttackGroup`` at construction - time. When a chosen technique declares a ``seed_technique``, that group - is merged into the seed group before execution (mirroring the static - ``AtomicAttack`` path). + ask the selector, execute the chosen technique against the current seed + group, record the outcome, and stop early on success. The selector is + shared by reference across all dispatch calls in a scenario so learning + accumulates across objectives. + + The seed group for a given dispatch is read from + ``context.params.seed_group`` (captured by + ``AdaptiveDispatchParams.from_seed_group_async``). When a chosen + technique declares a ``seed_technique``, that group is merged into the + seed group before execution (mirroring the static ``AtomicAttack`` path). 
+ Techniques whose ``seed_technique`` is incompatible with the current + seed group are filtered out of the candidate pool for that call; if the + pool is empty the dispatcher raises so the per-call seed group is dropped + by the executor's partial-failure path rather than silently no-op'ing. On success, the dispatcher returns a fresh ``AttackResult`` copy of the winning inner result (new ``attack_result_id`` and ``timestamp``) with @@ -97,8 +152,8 @@ def __init__( *, objective_target: PromptTarget, techniques: dict[str, TechniqueBundle], - selector: AdaptiveTechniqueSelector, - seed_group: SeedAttackGroup, + selector: TechniqueSelector, + context_extractor: ContextExtractor = global_context, objective_scorer: TrueFalseScorer | None = None, max_attempts_per_objective: int = 3, ) -> None: @@ -108,10 +163,9 @@ def __init__( Stored for identifier/logging parity; not called directly. techniques (dict[str, TechniqueBundle]): Mapping from technique name to its bundle (attack, seed_technique, adversarial_chat). Must be non-empty. - selector (AdaptiveTechniqueSelector): Shared selector state. - seed_group (SeedAttackGroup): The seed group bound to this dispatcher. - Each attempt's chosen technique is applied against this group - (merging the technique's ``seed_technique`` when present). + selector (TechniqueSelector): Shared selector state. + context_extractor (ContextExtractor): Maps a per-call ``SeedAttackGroup`` to + the adaptive context key used by the selector. Defaults to ``global_context``. objective_scorer (TrueFalseScorer | None): Scorer passed through to techniques that generate simulated conversations. max_attempts_per_objective (int): Max attempts per objective; >= 1. 
@@ -128,12 +182,12 @@ def __init__( super().__init__( objective_target=objective_target, context_type=AdaptiveDispatchContext, - params_type=AttackParameters, + params_type=AdaptiveDispatchParams, logger=logger, ) self._techniques = techniques self._selector = selector - self._seed_group = seed_group + self._context_extractor = context_extractor self._objective_scorer = objective_scorer self._max_attempts = max_attempts_per_objective # Attempts are inherently sequential (each one reads the selector @@ -161,17 +215,19 @@ async def _run_inner_attack_async( self, *, bundle: TechniqueBundle, + seed_group: SeedAttackGroup, attempt_labels: dict[str, str], ) -> AttackResult: """ - Execute the chosen technique against this dispatcher's seed group. + Execute the chosen technique against the per-call seed group. - Merges ``bundle.seed_technique`` into the bound ``seed_group`` (when - present) and delegates execution to ``AttackExecutor``. Isolated as a - method so tests can patch the inner-attack call surface. + Merges ``bundle.seed_technique`` into ``seed_group`` (when present) + and delegates execution to ``AttackExecutor``. Isolated as a method + so tests can patch the inner-attack call surface. Args: bundle (TechniqueBundle): The chosen technique's attack + seeds + chat. + seed_group (SeedAttackGroup): The seed group for this dispatch call. attempt_labels (dict[str, str]): Memory labels stamped onto this attempt. Returns: @@ -182,9 +238,9 @@ async def _run_inner_attack_async( propagated exception (should be unreachable). 
""" if bundle.seed_technique is not None: - execution_group = self._seed_group.with_technique(technique=bundle.seed_technique) + execution_group = seed_group.with_technique(technique=bundle.seed_technique) else: - execution_group = self._seed_group + execution_group = seed_group executor_result = await self._executor.execute_attack_from_seed_groups_async( attack=bundle.attack, @@ -206,14 +262,16 @@ async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackRes """ Run the per-objective adaptive loop. - Resolves the per-objective context key from ``context.memory_labels`` - (falling back to :data:`GLOBAL_CONTEXT`), then loops up to + Reads the per-call ``SeedAttackGroup`` from ``context.params.seed_group``, + derives the adaptive context key via the injected ``ContextExtractor``, + and filters the technique pool to those whose ``seed_technique`` is + compatible with this seed group. Then loops up to ``max_attempts_per_objective`` times: select a technique, execute it, record the outcome, and stop early on success. Args: - context (AdaptiveDispatchContext): Execution context. ``memory_labels`` - may carry :data:`ADAPTIVE_CONTEXT_LABEL` to scope the selector. + context (AdaptiveDispatchContext): Execution context whose + ``params.seed_group`` carries the seed group for this call. Returns: AttackResult: A fresh dispatcher-owned copy of the final inner @@ -221,20 +279,46 @@ async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackRes (see class docstring for the two-row persistence note). Raises: + ValueError: If ``context.params.seed_group`` is missing, or if no + techniques in the pool are compatible with the seed group. RuntimeError: If the loop somehow ran zero attempts (unreachable because ``max_attempts_per_objective`` is validated >= 1). 
""" - adaptive_context = context.memory_labels.get(ADAPTIVE_CONTEXT_LABEL, GLOBAL_CONTEXT) - technique_names = list(self._techniques.keys()) + seed_group = context.params.seed_group + if seed_group is None: + raise ValueError( + "AdaptiveDispatchAttack requires AdaptiveDispatchParams.seed_group; " + "build params via AdaptiveDispatchParams.from_seed_group_async." + ) + + compatible_names = [ + name + for name, bundle in self._techniques.items() + if bundle.seed_technique is None + or seed_group.is_compatible_with_technique(technique=bundle.seed_technique) + ] + if not compatible_names: + raise ValueError( + f"AdaptiveDispatchAttack: no compatible techniques for seed group " + f"(objective={seed_group.objective.value!r})." + ) + + adaptive_context = self._context_extractor(seed_group) last_result: AttackResult | None = None trail: list[dict[str, str]] = [] for attempt_idx in range(self._max_attempts): - chosen = self._selector.select(context=adaptive_context, techniques=technique_names) + decision_key = f"{context.objective}:{attempt_idx}" + chosen = self._selector.select( + context=adaptive_context, + techniques=compatible_names, + decision_key=decision_key, + ) bundle = self._techniques[chosen] attempt_labels = { **context.memory_labels, + ADAPTIVE_CONTEXT_LABEL: adaptive_context, ADAPTIVE_TECHNIQUE_LABEL: chosen, ADAPTIVE_ATTEMPT_LABEL: str(attempt_idx + 1), } @@ -247,7 +331,9 @@ async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackRes chosen, ) - result = await self._run_inner_attack_async(bundle=bundle, attempt_labels=attempt_labels) + result = await self._run_inner_attack_async( + bundle=bundle, seed_group=seed_group, attempt_labels=attempt_labels + ) success = result.outcome == AttackOutcome.SUCCESS self._selector.record_outcome(context=adaptive_context, technique=chosen, success=success) diff --git a/pyrit/scenario/scenarios/adaptive/selectors/__init__.py b/pyrit/scenario/scenarios/adaptive/selectors/__init__.py new file mode 100644 
index 000000000..7e97f1940 --- /dev/null +++ b/pyrit/scenario/scenarios/adaptive/selectors/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Selector protocol, context extractors, and selector implementations.""" + +from pyrit.scenario.scenarios.adaptive.selectors.epsilon_greedy import ( + EpsilonGreedyTechniqueSelector, +) +from pyrit.scenario.scenarios.adaptive.selectors.protocol import ( + GLOBAL_CONTEXT, + UNCATEGORIZED_CONTEXT, + ContextExtractor, + TechniqueSelector, + global_context, + harm_category_context, +) + +__all__ = [ + "ContextExtractor", + "EpsilonGreedyTechniqueSelector", + "GLOBAL_CONTEXT", + "TechniqueSelector", + "UNCATEGORIZED_CONTEXT", + "global_context", + "harm_category_context", +] diff --git a/pyrit/scenario/scenarios/adaptive/selector.py b/pyrit/scenario/scenarios/adaptive/selectors/epsilon_greedy.py similarity index 71% rename from pyrit/scenario/scenarios/adaptive/selector.py rename to pyrit/scenario/scenarios/adaptive/selectors/epsilon_greedy.py index d2d9e63a7..ec6152097 100644 --- a/pyrit/scenario/scenarios/adaptive/selector.py +++ b/pyrit/scenario/scenarios/adaptive/selectors/epsilon_greedy.py @@ -1,53 +1,32 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-"""Epsilon-greedy selector and context extractors for adaptive scenarios.""" +"""Epsilon-greedy technique selector for adaptive scenarios.""" from __future__ import annotations +import hashlib import random +import struct import threading -from collections.abc import Callable, Sequence -from typing import TYPE_CHECKING +from collections.abc import Sequence -if TYPE_CHECKING: - from pyrit.models.seeds.seed_attack_group import SeedAttackGroup -"""Maps a ``SeedAttackGroup`` to an adaptive context key.""" -ContextExtractor = Callable[["SeedAttackGroup"], str] -"""Default context: all objectives share one selection table.""" -GLOBAL_CONTEXT: str = "_global" -"""Fallback context for seed groups with no harm category metadata.""" -UNCATEGORIZED_CONTEXT: str = "_uncategorized" - - -def global_context(_seed_attack_group: SeedAttackGroup) -> str: - """ - Return a single shared context for all objectives. - - Returns: - str: Always :data:`GLOBAL_CONTEXT`. +def _derive_rng(random_seed: int | None, context: str, decision_key: str) -> random.Random: """ - return GLOBAL_CONTEXT - + Derive a per-decision ``Random`` from ``(random_seed, context, decision_key)``. -def harm_category_context(seed_attack_group: SeedAttackGroup) -> str: + Returns a fresh ``random.Random`` seeded deterministically from the + inputs when ``random_seed`` is not None, or an unseeded ``Random`` otherwise. """ - Return a context keyed by the sorted, ``|``-joined harm categories. + if random_seed is None: + return random.Random() + digest = hashlib.sha256(f"{random_seed}|{context}|{decision_key}".encode()).digest() + derived_seed = struct.unpack(" None: """ Args: @@ -80,8 +63,8 @@ def __init__( pool_threshold (int): Minimum per-(context, technique) attempts before the local estimate replaces the pooled rate. Must be >= 1; set to 1 to disable pooling. Defaults to 3. - rng (random.Random | None): RNG for reproducible decisions. Defaults - to a fresh unseeded ``random.Random()``. 
+ random_seed (int | None): Base seed for deterministic per-decision RNG derivation. + Defaults to ``None`` (non-deterministic). Raises: ValueError: If ``epsilon`` is outside [0.0, 1.0] or ``pool_threshold`` < 1. @@ -93,21 +76,27 @@ def __init__( self._epsilon = epsilon self._pool_threshold = pool_threshold - self._rng = rng if rng is not None else random.Random() + self._seed = random_seed self._counts: dict[tuple[str, str], tuple[int, int]] = {} # Per-technique pooled counts, kept in sync with ``_counts`` so the # pooled-backoff branch in ``_estimate`` is O(1). self._global_counts: dict[str, tuple[int, int]] = {} - # Guards _counts, _global_counts, and _rng against concurrent callers. + # Monotonic counter for auto-generating decision keys when the caller + # doesn't provide one. + self._decision_counter: int = 0 + # Guards _counts, _global_counts, and _decision_counter against concurrent callers. self._lock = threading.Lock() - def select(self, *, context: str, techniques: Sequence[str]) -> str: + def select(self, *, context: str, techniques: Sequence[str], decision_key: str = "") -> str: """ Pick the next technique to try for ``context``. Args: context (str): The context key. techniques (Sequence[str]): Candidate technique names. + decision_key (str): Caller-supplied key (e.g. ``"obj_id:attempt_idx"``) + used to derive a per-decision RNG for deterministic replay. + Defaults to ``""`` (auto-incremented counter). Returns: str: The chosen technique name. 
@@ -120,13 +109,20 @@ def select(self, *, context: str, techniques: Sequence[str]) -> str: raise ValueError("techniques must contain at least one entry") with self._lock: - if self._rng.random() < self._epsilon: - return self._rng.choice(technique_list) + if decision_key: + effective_key = decision_key + else: + effective_key = str(self._decision_counter) + self._decision_counter += 1 + rng = _derive_rng(self._seed, context, effective_key) + + if rng.random() < self._epsilon: + return rng.choice(technique_list) estimates = {t: self._estimate(context=context, technique=t) for t in technique_list} best = max(estimates.values()) winners = [t for t, value in estimates.items() if value >= best - self._TIE_TOL] - return self._rng.choice(winners) + return rng.choice(winners) def record_outcome(self, *, context: str, technique: str, success: bool) -> None: """ @@ -180,3 +176,4 @@ def _estimate(self, *, context: str, technique: str) -> float: return (local_s + 1) / (local_n + 1) global_s, global_n = self._global_counts.get(technique, (0, 0)) return (global_s + 1) / (global_n + 1) + diff --git a/pyrit/scenario/scenarios/adaptive/selectors/protocol.py b/pyrit/scenario/scenarios/adaptive/selectors/protocol.py new file mode 100644 index 000000000..e8c8c640f --- /dev/null +++ b/pyrit/scenario/scenarios/adaptive/selectors/protocol.py @@ -0,0 +1,66 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +"""Selector protocol and context extractors for adaptive scenarios.""" + +from __future__ import annotations + +from collections.abc import Callable, Sequence +from typing import TYPE_CHECKING, Protocol, runtime_checkable + +if TYPE_CHECKING: + from pyrit.models.seeds.seed_attack_group import SeedAttackGroup + +ContextExtractor = Callable[["SeedAttackGroup"], str] +"""Maps a ``SeedAttackGroup`` to an adaptive context key.""" + +GLOBAL_CONTEXT: str = "_global" +"""Default context: all objectives share one selection table.""" + +UNCATEGORIZED_CONTEXT: str = "_uncategorized" +"""Fallback context for seed groups with no harm category metadata.""" + + +def global_context(_seed_attack_group: SeedAttackGroup) -> str: + """ + Return a single shared context for all objectives. + + Returns: + str: Always :data:`GLOBAL_CONTEXT`. + """ + return GLOBAL_CONTEXT + + +def harm_category_context(seed_attack_group: SeedAttackGroup) -> str: + """ + Return a context keyed by the sorted, ``|``-joined harm categories. + + Multi-category seeds form their own bucket; sorting makes the key deterministic. + + Returns: + str: The ``|``-joined sorted harm categories, or :data:`UNCATEGORIZED_CONTEXT` + when the seed group has no categories. + """ + categories = seed_attack_group.harm_categories + if not categories: + return UNCATEGORIZED_CONTEXT + return "|".join(sorted(categories)) + + +@runtime_checkable +class TechniqueSelector(Protocol): + """ + Protocol for adaptive technique selectors. + + Any object implementing ``select`` and ``record_outcome`` can serve as + the selector for an ``AdaptiveScenario``. The epsilon-greedy + implementation (:class:`EpsilonGreedyTechniqueSelector`) is the default. + """ + + def select(self, *, context: str, techniques: Sequence[str], decision_key: str = "") -> str: + """Pick the next technique to try for ``context``.""" + ... 
# pragma: no cover + + def record_outcome(self, *, context: str, technique: str, success: bool) -> None: + """Record the outcome of an attempt.""" + ... # pragma: no cover diff --git a/pyrit/scenario/scenarios/adaptive/text_adaptive.py b/pyrit/scenario/scenarios/adaptive/text_adaptive.py index 4bbbe7ff4..c1d1e588a 100644 --- a/pyrit/scenario/scenarios/adaptive/text_adaptive.py +++ b/pyrit/scenario/scenarios/adaptive/text_adaptive.py @@ -6,9 +6,9 @@ Picks attack techniques per-objective using an epsilon-greedy selector informed by observed success rates. Runs up to ``max_attempts_per_objective`` -techniques per objective and stops early on success. The available techniques -come from the selected scenario strategies (``--strategies single_turn`` -restricts to single-turn techniques, etc.). +techniques per objective and stops early on success. ``prompt_sending`` is +excluded from the adaptive technique pool and runs as the baseline comparison +instead. """ from __future__ import annotations @@ -17,11 +17,13 @@ from typing import TYPE_CHECKING, ClassVar from pyrit.common import apply_defaults +from pyrit.common.parameter import Parameter from pyrit.registry.tag_query import TagQuery from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.scenarios.adaptive.adaptive_scenario import AdaptiveScenario -from pyrit.scenario.scenarios.adaptive.selector import ( +from pyrit.scenario.scenarios.adaptive.selectors import ( ContextExtractor, + TechniqueSelector, global_context, ) @@ -31,10 +33,15 @@ logger = logging.getLogger(__name__) +# Techniques excluded from the adaptive technique pool. These run as the +# baseline comparison rather than as adversarial moves the selector chooses. +_EXCLUDED_TECHNIQUES = frozenset({"prompt_sending"}) + def _build_text_adaptive_strategy() -> type[ScenarioStrategy]: """ - Build the strategy enum from the core scenario-techniques catalog. 
+ Build the strategy enum from the core scenario-techniques catalog, + excluding techniques that run as baseline. Returns: type[ScenarioStrategy]: The dynamically-built strategy enum class. @@ -44,9 +51,11 @@ def _build_text_adaptive_strategy() -> type[ScenarioStrategy]: ) from pyrit.scenario.core.scenario_techniques import SCENARIO_TECHNIQUES + filtered_specs = [spec for spec in SCENARIO_TECHNIQUES if spec.name not in _EXCLUDED_TECHNIQUES] + return AttackTechniqueRegistry.build_strategy_class_from_specs( # type: ignore[return-value, ty:invalid-return-type] class_name="TextAdaptiveStrategy", - specs=SCENARIO_TECHNIQUES, + specs=filtered_specs, aggregate_tags={ "default": TagQuery.any_of("default"), "single_turn": TagQuery.any_of("single_turn"), @@ -60,8 +69,8 @@ class TextAdaptive(AdaptiveScenario): Adaptive text-attack scenario. Selects techniques per-objective via an epsilon-greedy selector over the - set of selected strategies. ``prompt_sending`` participates as one of the - selector's techniques rather than being prepended as a baseline. + set of selected strategies. ``prompt_sending`` runs as the baseline + comparison and is excluded from the adaptive technique pool. """ VERSION: int = 1 @@ -99,39 +108,48 @@ def default_dataset_config(cls) -> DatasetConfiguration: """Return the default :class:`DatasetConfiguration` (required datasets, capped at 4 per dataset).""" return DatasetConfiguration(dataset_names=cls.required_datasets(), max_dataset_size=4) + @classmethod + def supported_parameters(cls) -> list[Parameter]: + """ + Declare custom parameters this scenario accepts from the CLI / config file. + + Returns: + list[Parameter]: Parameters configurable per-run. 
+ """ + return [ + Parameter( + name="max_attempts_per_objective", + description="Max techniques tried per objective.", + param_type=int, + default=3, + ), + ] + @apply_defaults def __init__( self, *, objective_scorer: TrueFalseScorer | None = None, - epsilon: float = 0.2, - pool_threshold: int = 3, - max_attempts_per_objective: int = 3, - seed: int | None = None, context_extractor: ContextExtractor = global_context, + selector: TechniqueSelector | None = None, scenario_result_id: str | None = None, ) -> None: """ Args: objective_scorer (TrueFalseScorer | None): Scorer used to judge each response. Defaults to the composite scorer from the base class. - epsilon (float): Exploration probability for the selector. Defaults to 0.2. - pool_threshold (int): Minimum per-(context, technique) attempts before - the local estimate overrides the pooled rate. Set to 1 to disable - pooling. Defaults to 3. - max_attempts_per_objective (int): Max techniques per objective. Defaults to 3. - seed (int | None): RNG seed for deterministic selection. Defaults to ``None``. context_extractor (ContextExtractor): Maps a ``SeedAttackGroup`` to a context key. Defaults to ``global_context``. Use ``harm_category_context`` to partition by harm category. + selector (TechniqueSelector | None): Pre-built selector. When ``None`` + (default) an :class:`EpsilonGreedyTechniqueSelector` is created + with default settings. Pass a custom instance to tune + ``epsilon``, ``pool_threshold``, or ``random_seed``. scenario_result_id (str | None): ID of an existing ``ScenarioResult`` to resume. 
""" super().__init__( objective_scorer=objective_scorer, - epsilon=epsilon, - pool_threshold=pool_threshold, - max_attempts_per_objective=max_attempts_per_objective, - seed=seed, context_extractor=context_extractor, + selector=selector, scenario_result_id=scenario_result_id, ) diff --git a/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py b/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py index 4be4ffbb6..963d6bd63 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py +++ b/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py @@ -1,24 +1,23 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import random from unittest.mock import AsyncMock, MagicMock import pytest -from pyrit.executor.attack.core.attack_parameters import AttackParameters from pyrit.models import AttackOutcome, AttackResult, SeedAttackGroup, SeedObjective from pyrit.scenario.scenarios.adaptive.dispatcher import ( ADAPTIVE_ATTEMPT_LABEL, - ADAPTIVE_CONTEXT_LABEL, ADAPTIVE_TECHNIQUE_LABEL, AdaptiveDispatchAttack, AdaptiveDispatchContext, + AdaptiveDispatchParams, TechniqueBundle, ) -from pyrit.scenario.scenarios.adaptive.selector import ( +from pyrit.scenario.scenarios.adaptive.selectors import ( GLOBAL_CONTEXT, - AdaptiveTechniqueSelector, + EpsilonGreedyTechniqueSelector, + harm_category_context, ) @@ -34,8 +33,22 @@ def _make_bundle(*, name: str, outcomes: list[AttackOutcome], seed_technique=Non return TechniqueBundle(attack=attack, seed_technique=seed_technique) -def _make_context(*, objective: str = "obj", labels: dict[str, str] | None = None) -> AdaptiveDispatchContext: - return AdaptiveDispatchContext(params=AttackParameters(objective=objective, memory_labels=labels or {})) +def _make_context( + *, + objective: str = "obj", + labels: dict[str, str] | None = None, + seed_group: SeedAttackGroup | None = None, + harm_categories: list[str] | None = None, +) -> AdaptiveDispatchContext: + if seed_group is None: + seed_group = 
SeedAttackGroup(seeds=[SeedObjective(value=objective, harm_categories=harm_categories)]) + return AdaptiveDispatchContext( + params=AdaptiveDispatchParams( + objective=objective, + memory_labels=labels or {}, + seed_group=seed_group, + ) + ) def _patch_inner( @@ -52,7 +65,7 @@ def _patch_inner( name_for_attack = {id(b.attack): name for name, b in bundles.items()} counters: dict[str, int] = dict.fromkeys(bundles, 0) - async def _stub(*, bundle: TechniqueBundle, attempt_labels: dict[str, str]) -> AttackResult: + async def _stub(*, bundle: TechniqueBundle, seed_group, attempt_labels: dict[str, str]) -> AttackResult: name = name_for_attack[id(bundle.attack)] idx = counters[name] counters[name] = idx + 1 @@ -69,9 +82,9 @@ async def _stub(*, bundle: TechniqueBundle, attempt_labels: dict[str, str]) -> A @pytest.fixture -def selector() -> AdaptiveTechniqueSelector: +def selector() -> EpsilonGreedyTechniqueSelector: # epsilon=0 makes selection deterministic given the table. - return AdaptiveTechniqueSelector(epsilon=0.0, pool_threshold=1, rng=random.Random(0)) + return EpsilonGreedyTechniqueSelector(epsilon=0.0, pool_threshold=1, random_seed=0) @pytest.fixture @@ -92,7 +105,6 @@ def test_init_rejects_empty_techniques(self, target, selector, seed_group): objective_target=target, techniques={}, selector=selector, - seed_group=seed_group, ) @pytest.mark.parametrize("bad_max", [0, -1]) @@ -103,7 +115,6 @@ def test_init_rejects_invalid_max_attempts(self, target, selector, seed_group, b objective_target=target, techniques={"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])}, selector=selector, - seed_group=seed_group, max_attempts_per_objective=bad_max, ) @@ -119,7 +130,6 @@ async def test_stops_on_first_success(self, target, selector, seed_group): objective_target=target, techniques=bundles, selector=selector, - seed_group=seed_group, max_attempts_per_objective=5, ) inner = _patch_inner(dispatcher=dispatcher, bundles=bundles) @@ -138,7 +148,6 @@ async def 
test_retries_until_max_attempts_on_failure(self, target, selector, see objective_target=target, techniques=bundles, selector=selector, - seed_group=seed_group, max_attempts_per_objective=3, ) inner = _patch_inner(dispatcher=dispatcher, bundles=bundles) @@ -157,7 +166,6 @@ async def test_updates_selector_on_each_attempt(self, target, selector, seed_gro objective_target=target, techniques=bundles, selector=selector, - seed_group=seed_group, max_attempts_per_objective=3, ) inner = _patch_inner(dispatcher=dispatcher, bundles=bundles) @@ -173,7 +181,6 @@ async def test_passes_attempt_labels_to_inner(self, target, selector, seed_group objective_target=target, techniques=bundles, selector=selector, - seed_group=seed_group, ) inner = _patch_inner(dispatcher=dispatcher, bundles=bundles) @@ -184,7 +191,7 @@ async def test_passes_attempt_labels_to_inner(self, target, selector, seed_group assert labels[ADAPTIVE_TECHNIQUE_LABEL] == "a" assert labels[ADAPTIVE_ATTEMPT_LABEL] == "1" - async def test_uses_adaptive_context_from_label(self, target, selector, seed_group): + async def test_uses_adaptive_context_from_extractor(self, target, selector, seed_group): # Two techniques; one has been heavily rewarded under context "violence" only. bundles = { "a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS]), @@ -199,23 +206,22 @@ async def test_uses_adaptive_context_from_label(self, target, selector, seed_gro objective_target=target, techniques=bundles, selector=selector, - seed_group=seed_group, + context_extractor=harm_category_context, ) inner = _patch_inner(dispatcher=dispatcher, bundles=bundles) - ctx = _make_context(labels={ADAPTIVE_CONTEXT_LABEL: "violence"}) + ctx = _make_context(harm_categories=["violence"]) await dispatcher._perform_async(context=ctx) # Exploit should have picked "b" first. 
chosen_bundle = inner.call_args.kwargs["bundle"] assert chosen_bundle is bundles["b"] - async def test_falls_back_to_global_context_when_label_missing(self, target, selector, seed_group): + async def test_falls_back_to_global_context_with_default_extractor(self, target, selector, seed_group): bundles = {"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])} dispatcher = AdaptiveDispatchAttack( objective_target=target, techniques=bundles, selector=selector, - seed_group=seed_group, ) _patch_inner(dispatcher=dispatcher, bundles=bundles) await dispatcher._perform_async(context=_make_context(labels={})) @@ -229,7 +235,6 @@ async def test_metadata_records_adaptive_trail(self, target, selector, seed_grou objective_target=target, techniques=bundles, selector=selector, - seed_group=seed_group, max_attempts_per_objective=3, ) _patch_inner(dispatcher=dispatcher, bundles=bundles) @@ -243,22 +248,15 @@ async def test_metadata_records_adaptive_trail(self, target, selector, seed_grou assert result.metadata["adaptive_context"] == GLOBAL_CONTEXT async def test_returns_fresh_result_distinct_from_inner(self, target, selector, seed_group): - # The dispatcher must NOT return the inner attack's ``AttackResult`` - # instance — doing so would cause a duplicate-PK insert when both the - # inner and the dispatcher's ``execute_async`` post-execute hooks try - # to persist the same row. Verify the returned result has a fresh - # ``attack_result_id`` while preserving the inner's identifying fields - # and stamping the dispatch trail. 
bundles = {"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])} dispatcher = AdaptiveDispatchAttack( objective_target=target, techniques=bundles, selector=selector, - seed_group=seed_group, ) inner_ids: list[str] = [] - async def _spy(*, bundle, attempt_labels): + async def _spy(*, bundle, seed_group, attempt_labels): inner_result = AttackResult( conversation_id="conv-a-0", objective="obj", @@ -287,7 +285,6 @@ def test_validate_rejects_empty_objective(self, target, selector, seed_group, ba objective_target=target, techniques={"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])}, selector=selector, - seed_group=seed_group, ) with pytest.raises(ValueError, match="objective"): dispatcher._validate_context(context=_make_context(objective=bad_objective)) @@ -297,7 +294,6 @@ def test_validate_accepts_normal_objective(self, target, selector, seed_group): objective_target=target, techniques={"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])}, selector=selector, - seed_group=seed_group, ) # Does not raise. dispatcher._validate_context(context=_make_context(objective="ok")) diff --git a/tests/unit/scenario/scenarios/adaptive/test_selector.py b/tests/unit/scenario/scenarios/adaptive/test_epsilon_greedy.py similarity index 81% rename from tests/unit/scenario/scenarios/adaptive/test_selector.py rename to tests/unit/scenario/scenarios/adaptive/test_epsilon_greedy.py index 2daba3b70..80c2eec8b 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_selector.py +++ b/tests/unit/scenario/scenarios/adaptive/test_epsilon_greedy.py @@ -1,48 +1,42 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-import random -from unittest.mock import MagicMock - import pytest -from pyrit.scenario.scenarios.adaptive.selector import ( +from pyrit.scenario.scenarios.adaptive.selectors import ( GLOBAL_CONTEXT, - UNCATEGORIZED_CONTEXT, - AdaptiveTechniqueSelector, - global_context, - harm_category_context, + EpsilonGreedyTechniqueSelector, ) TECHNIQUES = ["a", "b", "c", "d"] -def _seeded_selector(*, epsilon: float = 0.0, pool_threshold: int = 3, seed: int = 0) -> AdaptiveTechniqueSelector: - return AdaptiveTechniqueSelector( +def _seeded_selector(*, epsilon: float = 0.0, pool_threshold: int = 3, random_seed: int = 0) -> EpsilonGreedyTechniqueSelector: + return EpsilonGreedyTechniqueSelector( epsilon=epsilon, pool_threshold=pool_threshold, - rng=random.Random(seed), + random_seed=random_seed, ) -class TestAdaptiveTechniqueSelectorInit: +class TestEpsilonGreedyTechniqueSelectorInit: def test_init_defaults(self): - selector = AdaptiveTechniqueSelector() + selector = EpsilonGreedyTechniqueSelector() assert selector.snapshot() == {} @pytest.mark.parametrize("bad_epsilon", [-0.1, 1.1, 2.0, -1.0]) def test_init_rejects_out_of_range_epsilon(self, bad_epsilon): with pytest.raises(ValueError, match="epsilon"): - AdaptiveTechniqueSelector(epsilon=bad_epsilon) + EpsilonGreedyTechniqueSelector(epsilon=bad_epsilon) def test_init_rejects_pool_threshold_below_one(self): with pytest.raises(ValueError, match="pool_threshold"): - AdaptiveTechniqueSelector(pool_threshold=0) + EpsilonGreedyTechniqueSelector(pool_threshold=0) with pytest.raises(ValueError, match="pool_threshold"): - AdaptiveTechniqueSelector(pool_threshold=-1) + EpsilonGreedyTechniqueSelector(pool_threshold=-1) -class TestAdaptiveTechniqueSelectorSelect: +class TestEpsilonGreedyTechniqueSelectorSelect: def test_select_empty_techniques_raises(self): selector = _seeded_selector() with pytest.raises(ValueError, match="techniques"): @@ -52,7 +46,7 @@ def test_select_all_unseen_ties_resolved_randomly(self): # With epsilon=0 and an 
empty table, every technique has estimate 1/1=1.0, # so the result is the seeded random tiebreak. Different seeds should # be able to produce different winners. - winners = {_seeded_selector(seed=s).select(context=GLOBAL_CONTEXT, techniques=TECHNIQUES) for s in range(50)} + winners = {_seeded_selector(random_seed=s).select(context=GLOBAL_CONTEXT, techniques=TECHNIQUES) for s in range(50)} assert len(winners) > 1 assert winners.issubset(set(TECHNIQUES)) @@ -102,7 +96,7 @@ def test_select_cold_start_round_robins(self): assert sorted(tried) == sorted(TECHNIQUES) -class TestAdaptiveTechniqueSelectorUpdate: +class TestEpsilonGreedyTechniqueSelectorUpdate: def test_record_outcome_accumulates_counts(self): selector = _seeded_selector() selector.record_outcome(context="ctx", technique="a", success=True) @@ -139,7 +133,7 @@ def test_record_outcome_keeps_pooled_global_counts_in_sync(self): assert selector.success_rate(context="new_ctx", technique="c") == pytest.approx(1.0) -class TestAdaptiveTechniqueSelectorEstimate: +class TestEpsilonGreedyTechniqueSelectorEstimate: def test_success_rate_unseen_is_one(self): # Optimistic init: (0 + 1) / (0 + 1) = 1.0 selector = _seeded_selector() @@ -163,34 +157,7 @@ def test_success_rate_pools_when_below_threshold(self): assert selector.success_rate(context="ctx", technique="a") == pytest.approx(11 / 12) -class TestContextExtractors: - def test_global_context_is_constant(self): - sg = MagicMock() - assert global_context(sg) == GLOBAL_CONTEXT - - def test_harm_category_context_joins_sorted_categories(self): - sg = MagicMock() - sg.harm_categories = ["violence", "hate"] - # Multi-category seeds form their own bucket; sorting keeps the key deterministic. 
- assert harm_category_context(sg) == "hate|violence" - - def test_harm_category_context_single_category(self): - sg = MagicMock() - sg.harm_categories = ["violence"] - assert harm_category_context(sg) == "violence" - - def test_harm_category_context_falls_back_when_empty(self): - sg = MagicMock() - sg.harm_categories = [] - assert harm_category_context(sg) == UNCATEGORIZED_CONTEXT - - def test_harm_category_context_falls_back_when_none(self): - sg = MagicMock() - sg.harm_categories = None - assert harm_category_context(sg) == UNCATEGORIZED_CONTEXT - - -class TestAdaptiveTechniqueSelectorConcurrency: +class TestEpsilonGreedyTechniqueSelectorConcurrency: """Concurrent record_outcome / select calls must not corrupt counts.""" def test_concurrent_record_outcome_preserves_total_attempts(self): diff --git a/tests/unit/scenario/scenarios/adaptive/test_protocol.py b/tests/unit/scenario/scenarios/adaptive/test_protocol.py new file mode 100644 index 000000000..5d9b764e7 --- /dev/null +++ b/tests/unit/scenario/scenarios/adaptive/test_protocol.py @@ -0,0 +1,46 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from unittest.mock import MagicMock + +from pyrit.scenario.scenarios.adaptive.selectors import ( + GLOBAL_CONTEXT, + UNCATEGORIZED_CONTEXT, + EpsilonGreedyTechniqueSelector, + TechniqueSelector, + global_context, + harm_category_context, +) + + +class TestTechniqueSelectorProtocol: + def test_implements_protocol(self): + selector = EpsilonGreedyTechniqueSelector() + assert isinstance(selector, TechniqueSelector) + + +class TestContextExtractors: + def test_global_context_is_constant(self): + sg = MagicMock() + assert global_context(sg) == GLOBAL_CONTEXT + + def test_harm_category_context_joins_sorted_categories(self): + sg = MagicMock() + sg.harm_categories = ["violence", "hate"] + # Multi-category seeds form their own bucket; sorting keeps the key deterministic. 
+ assert harm_category_context(sg) == "hate|violence" + + def test_harm_category_context_single_category(self): + sg = MagicMock() + sg.harm_categories = ["violence"] + assert harm_category_context(sg) == "violence" + + def test_harm_category_context_falls_back_when_empty(self): + sg = MagicMock() + sg.harm_categories = [] + assert harm_category_context(sg) == UNCATEGORIZED_CONTEXT + + def test_harm_category_context_falls_back_when_none(self): + sg = MagicMock() + sg.harm_categories = None + assert harm_category_context(sg) == UNCATEGORIZED_CONTEXT diff --git a/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py b/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py index 12b1a45e2..13c8fd97b 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py +++ b/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py @@ -14,12 +14,11 @@ from pyrit.prompt_target import PromptTarget from pyrit.registry.object_registries.attack_technique_registry import AttackTechniqueRegistry from pyrit.scenario.core.dataset_configuration import DatasetConfiguration -from pyrit.scenario.core.scenario import BaselinePolicy +from pyrit.scenario.core.scenario import BaselineAttackPolicy from pyrit.scenario.scenarios.adaptive.dispatcher import ( - ADAPTIVE_CONTEXT_LABEL, AdaptiveDispatchAttack, ) -from pyrit.scenario.scenarios.adaptive.selector import ( +from pyrit.scenario.scenarios.adaptive.selectors import ( GLOBAL_CONTEXT, harm_category_context, ) @@ -110,8 +109,8 @@ class TestTextAdaptiveBasics: def test_version(self): assert TextAdaptive.VERSION == 1 - def test_baseline_forbidden(self): - assert TextAdaptive.BASELINE_POLICY is BaselinePolicy.Forbidden + def test_baseline_enabled(self): + assert TextAdaptive.BASELINE_ATTACK_POLICY is BaselineAttackPolicy.Enabled def test_default_dataset_config(self): config = TextAdaptive.default_dataset_config() @@ -134,16 +133,13 @@ def test_get_default_strategy(self): 
@patch("pyrit.scenario.core.scenario.Scenario._get_default_objective_scorer") def test_init_stores_adaptive_params(self, mock_get_scorer, mock_objective_scorer): mock_get_scorer.return_value = mock_objective_scorer - scenario = TextAdaptive( - epsilon=0.4, - pool_threshold=5, - max_attempts_per_objective=7, - seed=42, + scenario = TextAdaptive() + scenario.set_params_from_args( + args={ + "max_attempts_per_objective": 7, + } ) - assert scenario._epsilon == 0.4 - assert scenario._pool_threshold == 5 - assert scenario._max_attempts_per_objective == 7 - assert scenario._seed == 42 + assert scenario.params["max_attempts_per_objective"] == 7 @pytest.mark.usefixtures(*FIXTURES) @@ -169,7 +165,7 @@ async def _build_scenario_and_attacks( ) return scenario, await scenario._get_atomic_attacks_async() - async def test_one_atomic_per_objective(self, mock_objective_target, mock_objective_scorer): + async def test_one_atomic_per_dataset(self, mock_objective_target, mock_objective_scorer): groups = { "violence": [ _make_seed_group(value="obj-v1", harm_categories=["violence"]), @@ -184,10 +180,10 @@ async def test_one_atomic_per_objective(self, mock_objective_target, mock_object mock_objective_scorer=mock_objective_scorer, seed_groups=groups, ) - assert len(attacks) == 3 - for atomic in attacks: - # Each atomic carries exactly one seed group. - assert len(atomic.objectives) == 1 + # One atomic per dataset, carrying all that dataset's seed groups. 
+ assert len(attacks) == 2 + total_seed_groups = sum(len(a.seed_groups) for a in attacks) + assert total_seed_groups == 3 async def test_atomics_share_one_selector_across_dispatchers(self, mock_objective_target, mock_objective_scorer): groups = { @@ -195,6 +191,9 @@ async def test_atomics_share_one_selector_across_dispatchers(self, mock_objectiv _make_seed_group(value="obj-v1", harm_categories=["violence"]), _make_seed_group(value="obj-v2", harm_categories=["violence"]), ], + "hate": [ + _make_seed_group(value="obj-h1", harm_categories=["hate"]), + ], } _scenario, attacks = await self._build_scenario_and_attacks( mock_objective_target=mock_objective_target, @@ -202,15 +201,17 @@ async def test_atomics_share_one_selector_across_dispatchers(self, mock_objectiv seed_groups=groups, ) dispatchers = [atomic._attack_technique.attack for atomic in attacks] - # Each objective gets its own dispatcher (bound to its own seed group)... + # One dispatcher per dataset (atomic). assert len({id(d) for d in dispatchers}) == len(attacks) for d in dispatchers: assert isinstance(d, AdaptiveDispatchAttack) - # ...but they all share the same selector so learning is global. + # All dispatchers share the same selector so learning is global. 
selectors = {id(d._selector) for d in dispatchers} assert len(selectors) == 1 - async def test_global_context_label_when_using_global_extractor(self, mock_objective_target, mock_objective_scorer): + async def test_default_context_extractor_is_global(self, mock_objective_target, mock_objective_scorer): + from pyrit.scenario.scenarios.adaptive.selectors import global_context + groups = { "violence": [_make_seed_group(value="obj-1", harm_categories=["violence"])], "hate": [_make_seed_group(value="obj-2", harm_categories=["hate"])], @@ -221,9 +222,14 @@ async def test_global_context_label_when_using_global_extractor(self, mock_objec seed_groups=groups, ) for atomic in attacks: - assert atomic._memory_labels[ADAPTIVE_CONTEXT_LABEL] == GLOBAL_CONTEXT - - async def test_harm_category_extractor_partitions_labels(self, mock_objective_target, mock_objective_scorer): + dispatcher = atomic._attack_technique.attack + # All seed groups in a global-extractor scenario resolve to the same + # context bucket regardless of harm category. + for sg in atomic.seed_groups: + assert dispatcher._context_extractor(sg) == GLOBAL_CONTEXT + assert dispatcher._context_extractor is global_context + + async def test_harm_category_extractor_partitions_contexts(self, mock_objective_target, mock_objective_scorer): groups = { "violence": [_make_seed_group(value="obj-v", harm_categories=["violence"])], "hate": [_make_seed_group(value="obj-h", harm_categories=["hate"])], @@ -235,21 +241,29 @@ async def test_harm_category_extractor_partitions_labels(self, mock_objective_ta seed_groups=groups, context_extractor=harm_category_context, ) - contexts = {atomic._memory_labels[ADAPTIVE_CONTEXT_LABEL] for atomic in attacks} - # Each objective gets its own context bucket from harm_category_context. 
+ contexts: set[str] = set() + for atomic in attacks: + dispatcher = atomic._attack_technique.attack + assert dispatcher._context_extractor is harm_category_context + for sg in atomic.seed_groups: + contexts.add(dispatcher._context_extractor(sg)) + # Each harm category gets its own context bucket. assert contexts == {"violence", "hate", "_uncategorized"} - async def test_atomic_names_are_unique(self, mock_objective_target, mock_objective_scorer): + async def test_atomic_names_are_dataset_scoped(self, mock_objective_target, mock_objective_scorer): groups = { "violence": [_make_seed_group(value=f"obj-{i}", harm_categories=["violence"]) for i in range(5)], + "hate": [_make_seed_group(value=f"hate-{i}", harm_categories=["hate"]) for i in range(3)], } _scenario, attacks = await self._build_scenario_and_attacks( mock_objective_target=mock_objective_target, mock_objective_scorer=mock_objective_scorer, seed_groups=groups, ) - names = [atomic.atomic_attack_name for atomic in attacks] - assert len(set(names)) == len(names) + names = {atomic.atomic_attack_name for atomic in attacks} + # One atomic name per dataset; the dataset name is embedded. 
+ assert len(names) == len(groups) + assert all(any(ds in name for ds in groups) for name in names) async def test_display_group_is_dataset_name(self, mock_objective_target, mock_objective_scorer): groups = { @@ -290,14 +304,13 @@ async def test_techniques_with_seed_technique_are_kept(self, mock_objective_targ patch.object(SeedAttackGroup, "is_compatible_with_technique", return_value=True), ): scenario = TextAdaptive(objective_scorer=mock_objective_scorer) - with patch.object( - scenario, - "_get_attack_technique_factories", - return_value={"prompt_sending": plain_factory, "many_shot": seeded_factory}, - ): + strategy_class = scenario.get_strategy_class() + factories = {"role_play": plain_factory, "many_shot": seeded_factory} + with patch.object(scenario, "_get_attack_technique_factories", return_value=factories): await scenario.initialize_async( objective_target=mock_objective_target, include_baseline=False, + scenario_strategies=[strategy_class("role_play"), strategy_class("many_shot")], ) attacks = scenario._atomic_attacks @@ -306,7 +319,7 @@ async def test_techniques_with_seed_technique_are_kept(self, mock_objective_targ assert isinstance(dispatcher, AdaptiveDispatchAttack) # Both factories survive; in particular the seeded one is no longer # silently dropped. 
- assert "prompt_sending" in dispatcher._techniques + assert "role_play" in dispatcher._techniques assert "many_shot" in dispatcher._techniques async def test_incompatible_seed_technique_is_filtered_per_objective( @@ -324,23 +337,26 @@ async def test_incompatible_seed_technique_is_filtered_per_objective( patch.object(SeedAttackGroup, "is_compatible_with_technique", return_value=False), ): scenario = TextAdaptive(objective_scorer=mock_objective_scorer) - with patch.object( - scenario, - "_get_attack_technique_factories", - return_value={"prompt_sending": plain_factory, "many_shot": incompatible_factory}, - ): + strategy_class = scenario.get_strategy_class() + factories = {"role_play": plain_factory, "many_shot": incompatible_factory} + with patch.object(scenario, "_get_attack_technique_factories", return_value=factories): await scenario.initialize_async( objective_target=mock_objective_target, include_baseline=False, + scenario_strategies=[strategy_class("role_play"), strategy_class("many_shot")], ) attacks = scenario._atomic_attacks assert len(attacks) == 1 dispatcher = attacks[0]._attack_technique.attack - # Only the plain technique survives; the seed_technique-bearing one is filtered out - # because is_compatible_with_technique returned False. - assert "prompt_sending" in dispatcher._techniques - assert "many_shot" not in dispatcher._techniques + # Under the one-atomic-per-dataset design, the full technique pool is + # shared by the dispatcher; per-call compatibility filtering now + # happens inside ``AdaptiveDispatchAttack._perform_async``. The seed + # group survived because the plain (no-seed_technique) factory keeps + # the compatible pool non-empty. 
+ assert "role_play" in dispatcher._techniques + assert "many_shot" in dispatcher._techniques + assert len(attacks[0].seed_groups) == 1 async def test_objective_skipped_when_no_compatible_techniques( self, mock_objective_target, mock_objective_scorer, caplog @@ -364,10 +380,11 @@ def _selective_compat(self_group, *, technique): patch.object(SeedAttackGroup, "is_compatible_with_technique", _selective_compat), ): scenario = TextAdaptive(objective_scorer=mock_objective_scorer) + strategy_class = scenario.get_strategy_class() with patch.object( scenario, "_get_attack_technique_factories", - return_value={"prompt_sending": seeded_factory}, + return_value={"role_play": seeded_factory}, ): import logging @@ -375,6 +392,7 @@ def _selective_compat(self_group, *, technique): await scenario.initialize_async( objective_target=mock_objective_target, include_baseline=False, + scenario_strategies=[strategy_class("role_play")], ) attacks = scenario._atomic_attacks @@ -392,49 +410,46 @@ def _build_scenario_no_resume_id(self, *, scorer): return TextAdaptive(objective_scorer=scorer) def test_no_scenario_result_id_is_noop(self, mock_objective_scorer): - from pyrit.scenario.scenarios.adaptive.selector import AdaptiveTechniqueSelector + from pyrit.scenario.scenarios.adaptive.selectors import EpsilonGreedyTechniqueSelector scenario = TextAdaptive(objective_scorer=mock_objective_scorer) - selector = AdaptiveTechniqueSelector() + selector = EpsilonGreedyTechniqueSelector() # No scenario_result_id set -> early return, no errors, no replays. 
scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a", "b"}) assert selector.snapshot() == {} def test_replays_attempts_from_metadata(self, mock_objective_scorer): from pyrit.models import AttackResult - from pyrit.scenario.scenarios.adaptive.selector import AdaptiveTechniqueSelector + from pyrit.scenario.scenarios.adaptive.selectors import EpsilonGreedyTechniqueSelector scenario = TextAdaptive(objective_scorer=mock_objective_scorer, scenario_result_id="rid") - prior_result = MagicMock() - prior_result.attack_results = { - "adaptive_violence_o1": [ - AttackResult( - conversation_id="c1", - objective="o1", - metadata={ - "adaptive_attempts": [ - {"technique": "a", "outcome": "failure"}, - {"technique": "b", "outcome": "success"}, - ], - "adaptive_context": "violence", - }, - ), - ], - "adaptive_hate_o2": [ - AttackResult( - conversation_id="c2", - objective="o2", - metadata={ - "adaptive_attempts": [{"technique": "a", "outcome": "success"}], - "adaptive_context": "hate", - }, - ), - ], - } - - selector = AdaptiveTechniqueSelector() - with patch.object(scenario._memory, "get_scenario_results", return_value=[prior_result]): + rows = [ + AttackResult( + conversation_id="c1", + objective="o1", + attribution_data={"parent_collection": "adaptive_violence"}, + metadata={ + "adaptive_attempts": [ + {"technique": "a", "outcome": "failure"}, + {"technique": "b", "outcome": "success"}, + ], + "adaptive_context": "violence", + }, + ), + AttackResult( + conversation_id="c2", + objective="o2", + attribution_data={"parent_collection": "adaptive_hate"}, + metadata={ + "adaptive_attempts": [{"technique": "a", "outcome": "success"}], + "adaptive_context": "hate", + }, + ), + ] + + selector = EpsilonGreedyTechniqueSelector() + with patch.object(scenario._memory, "get_attack_results", return_value=rows): scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a", "b"}) # Trails replayed verbatim into the per-context table. 
@@ -444,28 +459,26 @@ def test_replays_attempts_from_metadata(self, mock_objective_scorer): def test_skips_unknown_techniques(self, mock_objective_scorer): from pyrit.models import AttackResult - from pyrit.scenario.scenarios.adaptive.selector import AdaptiveTechniqueSelector + from pyrit.scenario.scenarios.adaptive.selectors import EpsilonGreedyTechniqueSelector scenario = TextAdaptive(objective_scorer=mock_objective_scorer, scenario_result_id="rid") - prior_result = MagicMock() - prior_result.attack_results = { - "x": [ - AttackResult( - conversation_id="c1", - objective="o1", - metadata={ - "adaptive_attempts": [ - {"technique": "removed_technique", "outcome": "success"}, - {"technique": "a", "outcome": "failure"}, - ], - "adaptive_context": "ctx", - }, - ), - ], - } - - selector = AdaptiveTechniqueSelector() - with patch.object(scenario._memory, "get_scenario_results", return_value=[prior_result]): + rows = [ + AttackResult( + conversation_id="c1", + objective="o1", + attribution_data={"parent_collection": "adaptive_violence"}, + metadata={ + "adaptive_attempts": [ + {"technique": "removed_technique", "outcome": "success"}, + {"technique": "a", "outcome": "failure"}, + ], + "adaptive_context": "ctx", + }, + ), + ] + + selector = EpsilonGreedyTechniqueSelector() + with patch.object(scenario._memory, "get_attack_results", return_value=rows): scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a"}) # Only the known technique was recorded. 
@@ -474,26 +487,30 @@ def test_skips_unknown_techniques(self, mock_objective_scorer): def test_ignores_results_without_adaptive_metadata(self, mock_objective_scorer): from pyrit.models import AttackResult - from pyrit.scenario.scenarios.adaptive.selector import AdaptiveTechniqueSelector + from pyrit.scenario.scenarios.adaptive.selectors import EpsilonGreedyTechniqueSelector scenario = TextAdaptive(objective_scorer=mock_objective_scorer, scenario_result_id="rid") - prior_result = MagicMock() - prior_result.attack_results = { - "baseline": [AttackResult(conversation_id="c", objective="o", metadata={})], - } - - selector = AdaptiveTechniqueSelector() - with patch.object(scenario._memory, "get_scenario_results", return_value=[prior_result]): + rows = [ + AttackResult( + conversation_id="c", + objective="o", + attribution_data={"parent_collection": "adaptive_violence"}, + metadata={}, + ), + ] + + selector = EpsilonGreedyTechniqueSelector() + with patch.object(scenario._memory, "get_attack_results", return_value=rows): scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a"}) assert selector.snapshot() == {} def test_memory_load_failure_is_swallowed(self, mock_objective_scorer): - from pyrit.scenario.scenarios.adaptive.selector import AdaptiveTechniqueSelector + from pyrit.scenario.scenarios.adaptive.selectors import EpsilonGreedyTechniqueSelector scenario = TextAdaptive(objective_scorer=mock_objective_scorer, scenario_result_id="rid") - selector = AdaptiveTechniqueSelector() - with patch.object(scenario._memory, "get_scenario_results", side_effect=RuntimeError("db down")): + selector = EpsilonGreedyTechniqueSelector() + with patch.object(scenario._memory, "get_attack_results", side_effect=RuntimeError("db down")): # Must not raise; selector remains empty. 
scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a"}) assert selector.snapshot() == {} @@ -501,12 +518,12 @@ def test_memory_load_failure_is_swallowed(self, mock_objective_scorer): @pytest.mark.usefixtures(*FIXTURES) class TestTextAdaptiveBaselinePolicy: - async def test_initialize_async_rejects_explicit_baseline(self, mock_objective_target, mock_objective_scorer): + async def test_initialize_async_accepts_explicit_baseline(self, mock_objective_target, mock_objective_scorer): groups = {"violence": [_make_seed_group(value="obj", harm_categories=["violence"])]} with patch.object(DatasetConfiguration, "get_seed_attack_groups", return_value=groups): scenario = TextAdaptive(objective_scorer=mock_objective_scorer) - with pytest.raises(ValueError): - await scenario.initialize_async( - objective_target=mock_objective_target, - include_baseline=True, - ) + # Baseline is Enabled by default, so explicit include_baseline=True must not raise. + await scenario.initialize_async( + objective_target=mock_objective_target, + include_baseline=True, + ) From 26cd65e130e48baf794dff69c035b36d39cff333 Mon Sep 17 00:00:00 2001 From: hannahwestra25 Date: Thu, 21 May 2026 17:09:50 -0400 Subject: [PATCH 11/12] fix: address pre-commit lint failures - SIM108: use ternary for selector assignment - D101: add docstring to AdaptiveDispatchParams - DOC201/DOC501: add Returns/Raises sections to docstrings - TC003: move Sequence import into TYPE_CHECKING block - Fix trailing newline in epsilon_greedy.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- doc/code/scenarios/3_adaptive_scenarios.ipynb | 8 ++----- doc/code/scenarios/3_adaptive_scenarios.py | 8 ++----- .../scenarios/adaptive/adaptive_scenario.py | 10 ++++----- .../scenario/scenarios/adaptive/dispatcher.py | 21 ++++++++++++------- .../adaptive/selectors/epsilon_greedy.py | 11 ++++++---- .../scenarios/adaptive/test_epsilon_greedy.py | 8 +++++-- 6 files changed, 34 insertions(+), 32 
deletions(-) diff --git a/doc/code/scenarios/3_adaptive_scenarios.ipynb b/doc/code/scenarios/3_adaptive_scenarios.ipynb index 88996496e..f3972653e 100644 --- a/doc/code/scenarios/3_adaptive_scenarios.ipynb +++ b/doc/code/scenarios/3_adaptive_scenarios.ipynb @@ -131,9 +131,7 @@ " context_extractor=harm_category_context,\n", " selector=EpsilonGreedyTechniqueSelector(epsilon=0.3, random_seed=42),\n", ")\n", - "configured_scenario.set_params_from_args(\n", - " args={\"max_attempts_per_objective\": 5}\n", - ")\n", + "configured_scenario.set_params_from_args(args={\"max_attempts_per_objective\": 5})\n", "\n", "await configured_scenario.initialize_async( # type: ignore\n", " objective_target=objective_target,\n", @@ -171,9 +169,7 @@ " selector=EpsilonGreedyTechniqueSelector(epsilon=0.3, random_seed=42),\n", " scenario_result_id=str(configured_result.id),\n", ")\n", - "resumed_scenario.set_params_from_args(\n", - " args={\"max_attempts_per_objective\": 5}\n", - ")\n", + "resumed_scenario.set_params_from_args(args={\"max_attempts_per_objective\": 5})\n", "\n", "await resumed_scenario.initialize_async( # type: ignore\n", " objective_target=objective_target,\n", diff --git a/doc/code/scenarios/3_adaptive_scenarios.py b/doc/code/scenarios/3_adaptive_scenarios.py index a0d38ec30..e4c8e211a 100644 --- a/doc/code/scenarios/3_adaptive_scenarios.py +++ b/doc/code/scenarios/3_adaptive_scenarios.py @@ -99,9 +99,7 @@ context_extractor=harm_category_context, selector=EpsilonGreedyTechniqueSelector(epsilon=0.3, random_seed=42), ) -configured_scenario.set_params_from_args( - args={"max_attempts_per_objective": 5} -) +configured_scenario.set_params_from_args(args={"max_attempts_per_objective": 5}) await configured_scenario.initialize_async( # type: ignore objective_target=objective_target, @@ -127,9 +125,7 @@ selector=EpsilonGreedyTechniqueSelector(epsilon=0.3, random_seed=42), scenario_result_id=str(configured_result.id), ) -resumed_scenario.set_params_from_args( - 
args={"max_attempts_per_objective": 5} -) +resumed_scenario.set_params_from_args(args={"max_attempts_per_objective": 5}) await resumed_scenario.initialize_async( # type: ignore objective_target=objective_target, diff --git a/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py b/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py index 862565d8e..f961071d1 100644 --- a/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py +++ b/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py @@ -28,8 +28,8 @@ TechniqueBundle, ) from pyrit.scenario.scenarios.adaptive.selectors import ( - EpsilonGreedyTechniqueSelector, ContextExtractor, + EpsilonGreedyTechniqueSelector, TechniqueSelector, global_context, ) @@ -123,11 +123,9 @@ async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: techniques = self._build_techniques_dict(objective_target=self._objective_target) - selector: TechniqueSelector - if self._custom_selector is not None: - selector = self._custom_selector - else: - selector = EpsilonGreedyTechniqueSelector() + selector: TechniqueSelector = ( + self._custom_selector if self._custom_selector is not None else EpsilonGreedyTechniqueSelector() + ) # On resume, replay prior attempt outcomes from persisted metadata. self._rehydrate_selector_from_memory(selector=selector, known_techniques=set(techniques)) diff --git a/pyrit/scenario/scenarios/adaptive/dispatcher.py b/pyrit/scenario/scenarios/adaptive/dispatcher.py index a8e671cdd..cd26a3b36 100644 --- a/pyrit/scenario/scenarios/adaptive/dispatcher.py +++ b/pyrit/scenario/scenarios/adaptive/dispatcher.py @@ -70,6 +70,8 @@ class TechniqueBundle: @dataclass(frozen=True) class AdaptiveDispatchParams(AttackParameters): + """Attack parameters for adaptive dispatch, carrying the original seed group.""" + # The original SeedAttackGroup is preserved on the params so the # dispatcher can apply per-attempt seed_technique merging and derive # the per-call adaptive context. 
Captured by ``from_seed_group_async``; @@ -81,10 +83,10 @@ async def from_seed_group_async( cls, *, seed_group: SeedAttackGroup, - adversarial_chat: Optional["PromptTarget"] = None, # noqa: ARG003 — required by base class signature - objective_scorer: Optional["TrueFalseScorer"] = None, # noqa: ARG003 — required by base class signature + adversarial_chat: Optional[PromptTarget] = None, # noqa: ARG003 — required by base class signature + objective_scorer: Optional[TrueFalseScorer] = None, # noqa: ARG003 — required by base class signature **overrides: Any, - ) -> "AdaptiveDispatchParams": + ) -> AdaptiveDispatchParams: """ Build params for a single dispatch and capture the original seed_group. @@ -93,6 +95,12 @@ async def from_seed_group_async( expansion / next_message extraction: the inner technique runs through its own ``execute_attack_from_seed_groups_async`` call which performs that work using the technique-merged seed_group. + + Returns: + AdaptiveDispatchParams: The constructed parameters with the seed group attached. + + Raises: + ValueError: If the seed_group's objective is not initialized or invalid overrides are passed. """ if seed_group.objective is None: raise ValueError("seed_group.objective is not initialized") @@ -101,9 +109,7 @@ async def from_seed_group_async( valid_fields = {f.name for f in dataclasses.fields(cls)} - {"seed_group"} invalid = set(overrides.keys()) - valid_fields if invalid: - raise ValueError( - f"{cls.__name__} does not accept parameters: {invalid}. Accepted: {valid_fields}" - ) + raise ValueError(f"{cls.__name__} does not accept parameters: {invalid}. 
Accepted: {valid_fields}") return cls( objective=seed_group.objective.value, @@ -294,8 +300,7 @@ async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackRes compatible_names = [ name for name, bundle in self._techniques.items() - if bundle.seed_technique is None - or seed_group.is_compatible_with_technique(technique=bundle.seed_technique) + if bundle.seed_technique is None or seed_group.is_compatible_with_technique(technique=bundle.seed_technique) ] if not compatible_names: raise ValueError( diff --git a/pyrit/scenario/scenarios/adaptive/selectors/epsilon_greedy.py b/pyrit/scenario/scenarios/adaptive/selectors/epsilon_greedy.py index ec6152097..9ff7e9f6b 100644 --- a/pyrit/scenario/scenarios/adaptive/selectors/epsilon_greedy.py +++ b/pyrit/scenario/scenarios/adaptive/selectors/epsilon_greedy.py @@ -9,15 +9,19 @@ import random import struct import threading -from collections.abc import Sequence +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Sequence def _derive_rng(random_seed: int | None, context: str, decision_key: str) -> random.Random: """ Derive a per-decision ``Random`` from ``(random_seed, context, decision_key)``. - Returns a fresh ``random.Random`` seeded deterministically from the - inputs when ``random_seed`` is not None, or an unseeded ``Random`` otherwise. + Returns: + random.Random: A fresh ``random.Random`` seeded deterministically from the + inputs when ``random_seed`` is not None, or an unseeded ``Random`` otherwise. 
""" if random_seed is None: return random.Random() @@ -176,4 +180,3 @@ def _estimate(self, *, context: str, technique: str) -> float: return (local_s + 1) / (local_n + 1) global_s, global_n = self._global_counts.get(technique, (0, 0)) return (global_s + 1) / (global_n + 1) - diff --git a/tests/unit/scenario/scenarios/adaptive/test_epsilon_greedy.py b/tests/unit/scenario/scenarios/adaptive/test_epsilon_greedy.py index 80c2eec8b..42d6b14b4 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_epsilon_greedy.py +++ b/tests/unit/scenario/scenarios/adaptive/test_epsilon_greedy.py @@ -11,7 +11,9 @@ TECHNIQUES = ["a", "b", "c", "d"] -def _seeded_selector(*, epsilon: float = 0.0, pool_threshold: int = 3, random_seed: int = 0) -> EpsilonGreedyTechniqueSelector: +def _seeded_selector( + *, epsilon: float = 0.0, pool_threshold: int = 3, random_seed: int = 0 +) -> EpsilonGreedyTechniqueSelector: return EpsilonGreedyTechniqueSelector( epsilon=epsilon, pool_threshold=pool_threshold, @@ -46,7 +48,9 @@ def test_select_all_unseen_ties_resolved_randomly(self): # With epsilon=0 and an empty table, every technique has estimate 1/1=1.0, # so the result is the seeded random tiebreak. Different seeds should # be able to produce different winners. 
- winners = {_seeded_selector(random_seed=s).select(context=GLOBAL_CONTEXT, techniques=TECHNIQUES) for s in range(50)} + winners = { + _seeded_selector(random_seed=s).select(context=GLOBAL_CONTEXT, techniques=TECHNIQUES) for s in range(50) + } assert len(winners) > 1 assert winners.issubset(set(TECHNIQUES)) From b4db6a674735ff11dada052072cca026c3de11f0 Mon Sep 17 00:00:00 2001 From: hannahwestra25 Date: Fri, 22 May 2026 18:20:15 -0400 Subject: [PATCH 12/12] Redesign TechniqueSelector: stateless, memory-backed, eval-hash keyed - Make TechniqueSelector stateless: queries memory instead of internal counts - Identify techniques by AttackTechnique eval hashes instead of names - Pre-select K techniques via num_top_techniques parameter - Add SelectorScope enum (ALL_RUNS / CURRENT_RUN) - Move ASR aggregation to pyrit/analytics/scenario_analysis.py - Rename protocol.py to technique_selector.py - Remove ContextExtractor, SelectorContext, redundant VERSION/BASELINE_ATTACK_POLICY - Add scanner CLI section to notebook - Rename number_to_get to num_top_techniques - Remove label_key from user-facing API (hardcode ADAPTIVE_TECHNIQUE_LABEL) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- doc/code/scenarios/3_adaptive_scenarios.ipynb | 78 ++++-- doc/code/scenarios/3_adaptive_scenarios.py | 51 ++-- pyrit/analytics/scenario_analysis.py | 67 +++++ pyrit/scenario/scenarios/adaptive/__init__.py | 10 +- .../scenarios/adaptive/adaptive_scenario.py | 115 ++------ .../scenario/scenarios/adaptive/dispatcher.py | 94 +++---- .../scenarios/adaptive/selectors/__init__.py | 18 +- .../adaptive/selectors/epsilon_greedy.py | 204 ++++++-------- .../scenarios/adaptive/selectors/protocol.py | 66 ----- .../adaptive/selectors/technique_selector.py | 64 +++++ .../scenarios/adaptive/text_adaptive.py | 11 +- .../unit/analytics/test_scenario_analysis.py | 111 ++++++++ .../scenarios/adaptive/test_dispatcher.py | 134 ++++----- .../scenarios/adaptive/test_epsilon_greedy.py | 260 
+++++++----------- .../scenarios/adaptive/test_protocol.py | 46 ---- .../adaptive/test_technique_selector.py | 13 + .../scenarios/adaptive/test_text_adaptive.py | 173 +----------- 17 files changed, 655 insertions(+), 860 deletions(-) create mode 100644 pyrit/analytics/scenario_analysis.py delete mode 100644 pyrit/scenario/scenarios/adaptive/selectors/protocol.py create mode 100644 pyrit/scenario/scenarios/adaptive/selectors/technique_selector.py create mode 100644 tests/unit/analytics/test_scenario_analysis.py delete mode 100644 tests/unit/scenario/scenarios/adaptive/test_protocol.py create mode 100644 tests/unit/scenario/scenarios/adaptive/test_technique_selector.py diff --git a/doc/code/scenarios/3_adaptive_scenarios.ipynb b/doc/code/scenarios/3_adaptive_scenarios.ipynb index f3972653e..f52ddf1be 100644 --- a/doc/code/scenarios/3_adaptive_scenarios.ipynb +++ b/doc/code/scenarios/3_adaptive_scenarios.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "0", + "id": "94e7f44a", "metadata": {}, "source": [ "# Adaptive Scenarios\n", @@ -39,7 +39,7 @@ }, { "cell_type": "markdown", - "id": "1", + "id": "cb716650", "metadata": {}, "source": [ "## Setup" @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2", + "id": "4b536900", "metadata": {}, "outputs": [], "source": [ @@ -57,7 +57,7 @@ "from pyrit.registry import TargetRegistry\n", "from pyrit.scenario import DatasetConfiguration\n", "from pyrit.scenario.printer.console_printer import ConsoleScenarioResultPrinter\n", - "from pyrit.scenario.scenarios.adaptive import TextAdaptive, harm_category_context\n", + "from pyrit.scenario.scenarios.adaptive import TextAdaptive\n", "from pyrit.setup import initialize_from_config_async\n", "\n", "await initialize_from_config_async(config_path=Path(\"../../scanner/pyrit_conf.yaml\")) # type: ignore\n", @@ -68,19 +68,19 @@ }, { "cell_type": "markdown", - "id": "3", + "id": "9f9ff786", "metadata": {}, "source": [ "## Basic usage\n", "\n", 
"Defaults: `max_attempts_per_objective=3`, epsilon-greedy selector with `epsilon=0.2`,\n", - "the subclass’s default datasets." + "the subclass's default datasets." ] }, { "cell_type": "code", "execution_count": null, - "id": "4", + "id": "33aa89d3", "metadata": {}, "outputs": [], "source": [ @@ -95,7 +95,7 @@ }, { "cell_type": "markdown", - "id": "5", + "id": "5083bbed", "metadata": {}, "source": [ "## Configuring a run\n", @@ -103,13 +103,9 @@ "- **`max_attempts_per_objective`** — caps techniques tried per objective. Higher means\n", " more chances to succeed and more API calls. Set via `set_params_from_args`.\n", "- **`selector`** — a pre-built `TechniqueSelector` instance. Pass an\n", - " `EpsilonGreedyTechniqueSelector(epsilon=..., pool_threshold=..., random_seed=...)`\n", - " to tune the selection algorithm. Defaults to `EpsilonGreedyTechniqueSelector()`\n", - " (`epsilon=0.2`, `pool_threshold=3`).\n", - "- **`context_extractor`** — partitions the success-rate table. The default\n", - " `global_context` keeps one shared table; `harm_category_context` learns each harm\n", - " category independently. Custom callables of type `Callable[[SeedAttackGroup], str]`\n", - " are supported.\n", + " `EpsilonGreedyTechniqueSelector(epsilon=..., random_seed=...)`\n", + " to tune the selection algorithm. Defaults to an epsilon-greedy selector with\n", + " `epsilon=0.2`.\n", "- **`scenario_strategies`** (on `initialize_async`) — restricts which techniques the\n", " selector can pick from. 
Use `TextAdaptive.get_strategy_class()` to access the enum.\n", "\n", @@ -119,7 +115,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6", + "id": "db966395", "metadata": {}, "outputs": [], "source": [ @@ -128,8 +124,10 @@ "strategy_class = TextAdaptive.get_strategy_class()\n", "\n", "configured_scenario = TextAdaptive(\n", - " context_extractor=harm_category_context,\n", - " selector=EpsilonGreedyTechniqueSelector(epsilon=0.3, random_seed=42),\n", + " selector=EpsilonGreedyTechniqueSelector(\n", + " epsilon=0.3,\n", + " random_seed=42,\n", + " ),\n", ")\n", "configured_scenario.set_params_from_args(args={\"max_attempts_per_objective\": 5})\n", "\n", @@ -147,26 +145,28 @@ }, { "cell_type": "markdown", - "id": "7", + "id": "ba7e7126", "metadata": {}, "source": [ "## Resuming a run\n", "\n", "Adaptive scenarios are resumable — pass `scenario_result_id=...` to the `TextAdaptive`\n", - "constructor and the run picks up where it left off, with prior outcomes replayed into\n", - "the selector. Resume must use the same configuration as the original run." + "constructor and the run picks up where it left off. Resume must use the same\n", + "configuration as the original run." 
] }, { "cell_type": "code", "execution_count": null, - "id": "8", + "id": "4857bace", "metadata": {}, "outputs": [], "source": [ "resumed_scenario = TextAdaptive(\n", - " context_extractor=harm_category_context,\n", - " selector=EpsilonGreedyTechniqueSelector(epsilon=0.3, random_seed=42),\n", + " selector=EpsilonGreedyTechniqueSelector(\n", + " epsilon=0.3,\n", + " random_seed=42,\n", + " ),\n", " scenario_result_id=str(configured_result.id),\n", ")\n", "resumed_scenario.set_params_from_args(args={\"max_attempts_per_objective\": 5})\n", @@ -185,14 +185,13 @@ }, { "cell_type": "markdown", - "id": "9", + "id": "e267467c", "metadata": {}, "source": [ "## Inspecting which techniques were tried\n", "\n", "The dispatcher stamps every objective's `AttackResult.metadata` with:\n", "\n", - "- `adaptive_context` — the bucket key from the `context_extractor`.\n", "- `adaptive_attempts` — the ordered list of `{\"technique\", \"outcome\"}` dicts\n", " recording exactly which techniques the selector picked and what happened.\n", "\n", @@ -202,7 +201,7 @@ { "cell_type": "code", "execution_count": null, - "id": "10", + "id": "3a95436b", "metadata": {}, "outputs": [], "source": [ @@ -229,6 +228,31 @@ "for technique, n in picks.most_common():\n", " print(f\"{technique:20s} {wins[technique]:>4} / {n:<4} {wins[technique] / n:.0%}\")" ] + }, + { + "cell_type": "markdown", + "id": "37cd0756", + "metadata": {}, + "source": [ + "## Running from the scanner CLI\n", + "\n", + "You can run `TextAdaptive` directly from the `pyrit_scan` CLI without writing Python:\n", + "\n", + "```bash\n", + "# Basic run with defaults\n", + "pyrit_scan --scenario TextAdaptive --target openai_chat\n", + "\n", + "# Tune max attempts and restrict strategies\n", + "pyrit_scan --scenario TextAdaptive --target openai_chat \\\n", + " --params max_attempts_per_objective=5 \\\n", + " --strategies single_turn\n", + "\n", + "# Use specific datasets and limit size\n", + "pyrit_scan --scenario TextAdaptive --target 
openai_chat \\\n", + " --datasets airt_hate airt_violence \\\n", + " --max-dataset-size 10\n", + "```" + ] } ], "metadata": { diff --git a/doc/code/scenarios/3_adaptive_scenarios.py b/doc/code/scenarios/3_adaptive_scenarios.py index e4c8e211a..8826c4240 100644 --- a/doc/code/scenarios/3_adaptive_scenarios.py +++ b/doc/code/scenarios/3_adaptive_scenarios.py @@ -49,7 +49,7 @@ from pyrit.registry import TargetRegistry from pyrit.scenario import DatasetConfiguration from pyrit.scenario.printer.console_printer import ConsoleScenarioResultPrinter -from pyrit.scenario.scenarios.adaptive import TextAdaptive, harm_category_context +from pyrit.scenario.scenarios.adaptive import TextAdaptive from pyrit.setup import initialize_from_config_async await initialize_from_config_async(config_path=Path("../../scanner/pyrit_conf.yaml")) # type: ignore @@ -77,14 +77,10 @@ # # - **`max_attempts_per_objective`** — caps techniques tried per objective. Higher means # more chances to succeed and more API calls. Set via `set_params_from_args`. -# - **`selector`** — a pre-built ``TechniqueSelector`` instance. Pass an -# ``EpsilonGreedyTechniqueSelector(epsilon=..., pool_threshold=..., random_seed=...)`` -# to tune the selection algorithm. Defaults to ``EpsilonGreedyTechniqueSelector()`` -# (``epsilon=0.2``, ``pool_threshold=3``). -# - **`context_extractor`** — partitions the success-rate table. The default -# `global_context` keeps one shared table; `harm_category_context` learns each harm -# category independently. Custom callables of type `Callable[[SeedAttackGroup], str]` -# are supported. +# - **`selector`** — a pre-built `TechniqueSelector` instance. Pass an +# `EpsilonGreedyTechniqueSelector(epsilon=..., random_seed=...)` +# to tune the selection algorithm. Defaults to an epsilon-greedy selector with +# `epsilon=0.2`. # - **`scenario_strategies`** (on `initialize_async`) — restricts which techniques the # selector can pick from. 
Use `TextAdaptive.get_strategy_class()` to access the enum. # @@ -96,8 +92,10 @@ strategy_class = TextAdaptive.get_strategy_class() configured_scenario = TextAdaptive( - context_extractor=harm_category_context, - selector=EpsilonGreedyTechniqueSelector(epsilon=0.3, random_seed=42), + selector=EpsilonGreedyTechniqueSelector( + epsilon=0.3, + random_seed=42, + ), ) configured_scenario.set_params_from_args(args={"max_attempts_per_objective": 5}) @@ -116,13 +114,15 @@ # ## Resuming a run # # Adaptive scenarios are resumable — pass `scenario_result_id=...` to the `TextAdaptive` -# constructor and the run picks up where it left off, with prior outcomes replayed into -# the selector. Resume must use the same configuration as the original run. +# constructor and the run picks up where it left off. Resume must use the same +# configuration as the original run. # %% resumed_scenario = TextAdaptive( - context_extractor=harm_category_context, - selector=EpsilonGreedyTechniqueSelector(epsilon=0.3, random_seed=42), + selector=EpsilonGreedyTechniqueSelector( + epsilon=0.3, + random_seed=42, + ), scenario_result_id=str(configured_result.id), ) resumed_scenario.set_params_from_args(args={"max_attempts_per_objective": 5}) @@ -143,7 +143,6 @@ # # The dispatcher stamps every objective's `AttackResult.metadata` with: # -# - `adaptive_context` — the bucket key from the `context_extractor`. # - `adaptive_attempts` — the ordered list of `{"technique", "outcome"}` dicts # recording exactly which techniques the selector picked and what happened. 
# @@ -172,3 +171,23 @@ print("\nTechnique wins / picks rate") for technique, n in picks.most_common(): print(f"{technique:20s} {wins[technique]:>4} / {n:<4} {wins[technique] / n:.0%}") + +# %% [markdown] +# ## Running from the scanner CLI +# +# You can run `TextAdaptive` directly from the `pyrit_scan` CLI without writing Python: +# +# ```bash +# # Basic run with defaults +# pyrit_scan --scenario TextAdaptive --target openai_chat +# +# # Tune max attempts and restrict strategies +# pyrit_scan --scenario TextAdaptive --target openai_chat \ +# --params max_attempts_per_objective=5 \ +# --strategies single_turn +# +# # Use specific datasets and limit size +# pyrit_scan --scenario TextAdaptive --target openai_chat \ +# --datasets airt_hate airt_violence \ +# --max-dataset-size 10 +# ``` diff --git a/pyrit/analytics/scenario_analysis.py b/pyrit/analytics/scenario_analysis.py new file mode 100644 index 000000000..0903231f8 --- /dev/null +++ b/pyrit/analytics/scenario_analysis.py @@ -0,0 +1,67 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Scenario-level analytics: technique success rates and related helpers.""" + +from __future__ import annotations + +from collections.abc import Sequence + +from pyrit.analytics.result_analysis import AttackStats, _compute_stats +from pyrit.memory import CentralMemory +from pyrit.models import AttackOutcome + + +def compute_technique_success_rates( + *, + technique_hashes: Sequence[str], + label_key: str, + scenario_result_id: str | None = None, +) -> dict[str, AttackStats]: + """ + Query memory for historical success rates grouped by technique eval hash. + + Fetches all ``AttackResult`` rows whose memory labels contain + ``label_key`` matching one of ``technique_hashes``, then aggregates + outcomes into per-technique :class:`AttackStats`. + + By default queries across all scenario runs. Pass ``scenario_result_id`` + to restrict to a single run. 
+ + Args: + technique_hashes (Sequence[str]): Technique eval hashes to query. + label_key (str): Memory-label key that stores the technique hash. + scenario_result_id (str | None): If provided, restrict results to + a single scenario run. Defaults to ``None`` (all runs). + + Returns: + dict[str, AttackStats]: Stats per technique hash. Techniques with + no history are omitted from the result. + """ + + memory = CentralMemory.get_memory_instance() + results = memory.get_attack_results( + labels={label_key: list(technique_hashes)}, + scenario_result_id=scenario_result_id, + ) + + counts: dict[str, tuple[int, int, int, int]] = {} + for result in results: + technique = result.labels.get(label_key) + if not technique or technique not in technique_hashes: + continue + + s, f, u, e = counts.get(technique, (0, 0, 0, 0)) + if result.outcome == AttackOutcome.SUCCESS: + counts[technique] = (s + 1, f, u, e) + elif result.outcome == AttackOutcome.FAILURE: + counts[technique] = (s, f + 1, u, e) + elif result.outcome == AttackOutcome.ERROR: + counts[technique] = (s, f, u, e + 1) + else: + counts[technique] = (s, f, u + 1, e) + + stats: dict[str, AttackStats] = {} + for technique, (s, f, u, e) in counts.items(): + stats[technique] = _compute_stats(successes=s, failures=f, undetermined=u, errors=e) + return stats diff --git a/pyrit/scenario/scenarios/adaptive/__init__.py b/pyrit/scenario/scenarios/adaptive/__init__.py index 6ba741563..440e43a86 100644 --- a/pyrit/scenario/scenarios/adaptive/__init__.py +++ b/pyrit/scenario/scenarios/adaptive/__init__.py @@ -5,28 +5,22 @@ from pyrit.scenario.scenarios.adaptive.adaptive_scenario import AdaptiveScenario from pyrit.scenario.scenarios.adaptive.dispatcher import ( - ADAPTIVE_CONTEXT_LABEL, AdaptiveDispatchAttack, AdaptiveDispatchParams, ) from pyrit.scenario.scenarios.adaptive.selectors import ( - ContextExtractor, EpsilonGreedyTechniqueSelector, + SelectorScope, TechniqueSelector, - global_context, - harm_category_context, ) from 
pyrit.scenario.scenarios.adaptive.text_adaptive import TextAdaptive __all__ = [ - "ADAPTIVE_CONTEXT_LABEL", "AdaptiveDispatchAttack", "AdaptiveDispatchParams", "AdaptiveScenario", - "ContextExtractor", "EpsilonGreedyTechniqueSelector", + "SelectorScope", "TechniqueSelector", "TextAdaptive", - "global_context", - "harm_category_context", ] diff --git a/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py b/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py index f961071d1..15d5fa6ad 100644 --- a/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py +++ b/pyrit/scenario/scenarios/adaptive/adaptive_scenario.py @@ -5,10 +5,10 @@ ``AdaptiveScenario`` — modality-agnostic base for scenarios that pick attack techniques per-objective using a ``TechniqueSelector``. -Owns selector wiring, dispatcher construction, per-dataset atomic-attack -emission, and resume rehydration. Concrete subclasses (``TextAdaptive``, -future ``ImageAdaptive`` / ``AudioAdaptive``) only declare strategy class, -default datasets, version, and atomic-attack prefix. +Owns selector wiring, dispatcher construction, and per-dataset atomic-attack +emission. Concrete subclasses (``TextAdaptive``, future ``ImageAdaptive`` / +``AudioAdaptive``) only declare strategy class, default datasets, version, +and atomic-attack prefix. Baseline policy is ``Enabled``: prompt_sending runs as a separate baseline comparison and is excluded from the adaptive technique pool. 
@@ -22,16 +22,14 @@ from pyrit.executor.attack import AttackScoringConfig from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique -from pyrit.scenario.core.scenario import BaselineAttackPolicy, Scenario +from pyrit.scenario.core.scenario import Scenario from pyrit.scenario.scenarios.adaptive.dispatcher import ( AdaptiveDispatchAttack, TechniqueBundle, ) from pyrit.scenario.scenarios.adaptive.selectors import ( - ContextExtractor, EpsilonGreedyTechniqueSelector, TechniqueSelector, - global_context, ) if TYPE_CHECKING: @@ -47,15 +45,12 @@ class AdaptiveScenario(Scenario): Abstract base for adaptive (epsilon-greedy) scenarios. Subclasses must implement the standard ``Scenario`` class-method overrides - and declare ``VERSION`` and ``_atomic_attack_prefix``. Selector wiring, - dispatcher construction, per-objective atomic-attack emission, and resume - rehydration are handled here. + and declare ``_atomic_attack_prefix``. Selector wiring + and dispatcher construction are handled here. """ - BASELINE_ATTACK_POLICY: ClassVar[BaselineAttackPolicy] = BaselineAttackPolicy.Enabled - - #: Subclasses must declare a scenario version for memory bookkeeping. - VERSION: ClassVar[int] + #: Scenario version for memory bookkeeping. + VERSION: ClassVar[int] = 1 #: Prefix for per-objective atomic-attack names (e.g. ``"adaptive_text"``). _atomic_attack_prefix: ClassVar[str] = "adaptive" @@ -64,7 +59,6 @@ def __init__( self, *, objective_scorer: TrueFalseScorer | None = None, - context_extractor: ContextExtractor = global_context, selector: TechniqueSelector | None = None, scenario_result_id: str | None = None, ) -> None: @@ -72,8 +66,6 @@ def __init__( Args: objective_scorer (TrueFalseScorer | None): Scorer used to judge each response. Defaults to the composite scorer from the base class. - context_extractor (ContextExtractor): Maps a ``SeedAttackGroup`` to a - context key. Defaults to ``global_context``. 
selector (TechniqueSelector | None): Pre-built selector. When ``None`` (default) an :class:`EpsilonGreedyTechniqueSelector` is created with default settings. @@ -83,7 +75,6 @@ def __init__( objective_scorer = self._get_default_objective_scorer() self._objective_scorer: TrueFalseScorer = objective_scorer - self._context_extractor = context_extractor self._custom_selector = selector super().__init__( @@ -105,14 +96,9 @@ async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: dispatchers across all datasets share one ``TechniqueSelector`` instance so learning accumulates globally. - Seed groups whose objective is incompatible with every technique are - dropped up-front with a warning so the dispatcher never sees an empty - compatible pool at run time. - Returns: list[AtomicAttack]: One ``AtomicAttack`` per dataset that has at - least one compatible seed group. Empty if every seed group is - incompatible with every selected technique. + least one compatible seed group. Raises: ValueError: If ``self._objective_target`` is not set, or if @@ -124,10 +110,10 @@ async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: techniques = self._build_techniques_dict(objective_target=self._objective_target) selector: TechniqueSelector = ( - self._custom_selector if self._custom_selector is not None else EpsilonGreedyTechniqueSelector() + self._custom_selector + if self._custom_selector is not None + else EpsilonGreedyTechniqueSelector() ) - # On resume, replay prior attempt outcomes from persisted metadata. - self._rehydrate_selector_from_memory(selector=selector, known_techniques=set(techniques)) seed_groups_by_dataset = self._dataset_config.get_seed_attack_groups() atomic_attacks: list[AtomicAttack] = [] @@ -149,14 +135,19 @@ def _build_techniques_dict( objective_target: PromptTarget, ) -> dict[str, TechniqueBundle]: """ - Resolve selected strategies into a ``{name: TechniqueBundle}`` map. + Resolve selected strategies into a ``{eval_hash: TechniqueBundle}`` map. 
Each bundle carries the inner attack strategy along with the factory's ``seed_technique`` and ``adversarial_chat`` so the dispatcher can reproduce the static ``AtomicAttack`` execution path per attempt. + Technique keys are eval hashes derived from the ``AttackTechnique`` + identity (strategy + seed_technique configuration). This allows the + selector and analytics to track techniques by their behavioral + configuration rather than by name alone. + Returns: - dict[str, TechniqueBundle]: Mapping from technique name to its + dict[str, TechniqueBundle]: Mapping from technique eval hash to its bundle, in the order selected strategies were resolved. Raises: @@ -179,8 +170,10 @@ def _build_techniques_dict( objective_target=objective_target, attack_scoring_config=scoring_config, ) - techniques[technique_name] = TechniqueBundle( + eval_hash = technique.get_identifier().hash + techniques[eval_hash] = TechniqueBundle( attack=technique.attack, + name=technique_name, seed_technique=technique.seed_technique, adversarial_chat=factory.adversarial_chat, ) @@ -245,9 +238,9 @@ def _build_atomic_for_dataset( objective_target=self._objective_target, techniques=techniques, selector=selector, - context_extractor=self._context_extractor, objective_scorer=self._objective_scorer, max_attempts_per_objective=self.params["max_attempts_per_objective"], + scenario_result_id=self._scenario_result_id, ) return AtomicAttack( @@ -259,63 +252,3 @@ def _build_atomic_for_dataset( display_group=dataset_name, ) - def _rehydrate_selector_from_memory( - self, - *, - selector: TechniqueSelector, - known_techniques: set[str], - ) -> None: - """ - Replay persisted dispatch trails into ``selector`` so resume - preserves learned state. 
- - Queries ``AttackResultEntry`` rows directly by ``scenario_result_id`` - (which selects on ``attribution_parent_id`` stamped at write time by - ``AtomicAttack``'s attribution path) and filters to rows belonging to - this scenario's adaptive atomic attacks via - ``attribution_data["parent_collection"]``. - - Args: - selector (TechniqueSelector): A freshly built selector to populate. - known_techniques (set[str]): Techniques available in the current run. - Trails referencing unknown techniques (e.g. after a strategies - change) are skipped so replay can't poison the table. - """ - if not self._scenario_result_id: - return - - # Narrow to errors a memory backend would plausibly raise (DB/IO - # failures, integrity issues). Programmer-level errors propagate. - try: - rows = self._memory.get_attack_results(scenario_result_id=self._scenario_result_id) - except (RuntimeError, OSError, ValueError) as exc: - logger.warning(f"AdaptiveScenario: failed to load prior attack results for rehydration: {exc}") - return - - adaptive_prefix = f"{self._atomic_attack_prefix}_" - replayed = 0 - for result in rows: - if result.attribution_data is None: - continue - collection = result.attribution_data.get("parent_collection") - if not collection or not collection.startswith(adaptive_prefix): - continue - metadata = result.metadata or {} - trail = metadata.get("adaptive_attempts") - context = metadata.get("adaptive_context") - if not trail or not context: - continue - for step in trail: - technique = step.get("technique") - outcome = step.get("outcome") - if not technique or technique not in known_techniques: - continue - selector.record_outcome( - context=context, - technique=technique, - success=outcome == "success", - ) - replayed += 1 - - if replayed: - logger.info(f"AdaptiveScenario: rehydrated selector with {replayed} prior attempt(s).") diff --git a/pyrit/scenario/scenarios/adaptive/dispatcher.py b/pyrit/scenario/scenarios/adaptive/dispatcher.py index cd26a3b36..6a49cacaf 100644 
--- a/pyrit/scenario/scenarios/adaptive/dispatcher.py +++ b/pyrit/scenario/scenarios/adaptive/dispatcher.py @@ -2,17 +2,12 @@ # Licensed under the MIT license. """ -``AdaptiveDispatchAttack`` — picks an inner technique per attempt via a -``TechniqueSelector``, runs it, records the outcome, and loops up to -``max_attempts_per_objective`` times. - -The dispatcher is shared across all seed groups in an enclosing -``AtomicAttack`` and reads the per-call ``SeedAttackGroup`` from -``AdaptiveDispatchParams.seed_group`` (populated by -``AdaptiveDispatchParams.from_seed_group_async``). It computes the per-call -adaptive context key via the injected ``ContextExtractor`` and merges each -chosen technique's ``seed_technique`` (when present) into the seed group -before delegating execution to ``AttackExecutor``. +``AdaptiveDispatchAttack`` — picks inner techniques per objective via a +``TechniqueSelector``, runs them in priority order, and stops on success. + +The selector is stateless and async: it queries memory for historical +success rates. The dispatcher pre-selects up to ``max_attempts_per_objective`` +techniques at the start of each objective, then iterates through them. """ from __future__ import annotations @@ -29,10 +24,9 @@ from pyrit.executor.attack.core.attack_strategy import AttackContext, AttackStrategy from pyrit.models import AttackOutcome, AttackResult, SeedAttackGroup from pyrit.scenario.scenarios.adaptive.selectors import ( - ContextExtractor, TechniqueSelector, - global_context, ) +from pyrit.scenario.scenarios.adaptive.selectors.technique_selector import ADAPTIVE_TECHNIQUE_LABEL if TYPE_CHECKING: from pyrit.models import SeedAttackTechniqueGroup @@ -43,12 +37,7 @@ # Memory-label keys stamped onto persisted prompt rows so adaptive attempts -# can be filtered/grouped after a run. The dispatcher stamps all three on -# each attempt (context derived per-call from the seed group). 
-ADAPTIVE_CONTEXT_LABEL: str = "_adaptive_context" -"""Per-objective context key (e.g. ``"_global"`` or a harm category).""" -ADAPTIVE_TECHNIQUE_LABEL: str = "_adaptive_technique" -"""Technique chosen by the dispatcher for a given attempt.""" +# can be filtered/grouped after a run. ADAPTIVE_ATTEMPT_LABEL: str = "_adaptive_attempt" """1-based attempt index within the per-objective loop.""" @@ -64,6 +53,7 @@ class TechniqueBundle: """ attack: AttackStrategy[Any, AttackResult] + name: str = "" seed_technique: SeedAttackTechniqueGroup | None = None adversarial_chat: PromptTarget | None = None @@ -159,23 +149,22 @@ def __init__( objective_target: PromptTarget, techniques: dict[str, TechniqueBundle], selector: TechniqueSelector, - context_extractor: ContextExtractor = global_context, objective_scorer: TrueFalseScorer | None = None, max_attempts_per_objective: int = 3, + scenario_result_id: str | None = None, ) -> None: """ Args: objective_target (PromptTarget): The target inner attacks run against. - Stored for identifier/logging parity; not called directly. - techniques (dict[str, TechniqueBundle]): Mapping from technique name to - its bundle (attack, seed_technique, adversarial_chat). Must be non-empty. - selector (TechniqueSelector): Shared selector state. - context_extractor (ContextExtractor): Maps a per-call ``SeedAttackGroup`` to - the adaptive context key used by the selector. Defaults to ``global_context``. + techniques (dict[str, TechniqueBundle]): Mapping from technique eval hash to + its bundle (attack, name, seed_technique, adversarial_chat). Must be non-empty. + selector (TechniqueSelector): Stateless technique selector. objective_scorer (TrueFalseScorer | None): Scorer passed through to techniques that generate simulated conversations. max_attempts_per_objective (int): Max attempts per objective; >= 1. Defaults to 3. + scenario_result_id (str | None): If provided, passed to the selector + to scope memory queries to this scenario run. Defaults to ``None``. 
Raises: ValueError: If ``techniques`` is empty or ``max_attempts_per_objective`` < 1. @@ -193,12 +182,9 @@ def __init__( ) self._techniques = techniques self._selector = selector - self._context_extractor = context_extractor self._objective_scorer = objective_scorer self._max_attempts = max_attempts_per_objective - # Attempts are inherently sequential (each one reads the selector - # state updated by the previous), so a single shared executor with - # ``max_concurrency=1`` is reused across attempts. + self._scenario_result_id = scenario_result_id self._executor = AttackExecutor(max_concurrency=1) def _validate_context(self, *, context: AdaptiveDispatchContext) -> None: @@ -268,12 +254,9 @@ async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackRes """ Run the per-objective adaptive loop. - Reads the per-call ``SeedAttackGroup`` from ``context.params.seed_group``, - derives the adaptive context key via the injected ``ContextExtractor``, - and filters the technique pool to those whose ``seed_technique`` is - compatible with this seed group. Then loops up to - ``max_attempts_per_objective`` times: select a technique, execute it, - record the outcome, and stop early on success. + Pre-selects up to ``max_attempts_per_objective`` techniques via the + stateless selector, then iterates in priority order. Stops early on + success. Args: context (AdaptiveDispatchContext): Execution context whose @@ -281,8 +264,7 @@ async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackRes Returns: AttackResult: A fresh dispatcher-owned copy of the final inner - result with the dispatch trail stamped onto ``metadata`` - (see class docstring for the two-row persistence note). + result with the dispatch trail stamped onto ``metadata``. 
Raises: ValueError: If ``context.params.seed_group`` is missing, or if no @@ -308,55 +290,44 @@ async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackRes f"(objective={seed_group.objective.value!r})." ) - adaptive_context = self._context_extractor(seed_group) + chosen_techniques = await self._selector.select_async( + technique_identifiers=compatible_names, + objective=context.objective, + num_top_techniques=self._max_attempts, + scenario_result_id=self._scenario_result_id, + ) last_result: AttackResult | None = None trail: list[dict[str, str]] = [] - for attempt_idx in range(self._max_attempts): - decision_key = f"{context.objective}:{attempt_idx}" - chosen = self._selector.select( - context=adaptive_context, - techniques=compatible_names, - decision_key=decision_key, - ) + for attempt_idx, chosen in enumerate(chosen_techniques): bundle = self._techniques[chosen] attempt_labels = { **context.memory_labels, - ADAPTIVE_CONTEXT_LABEL: adaptive_context, ADAPTIVE_TECHNIQUE_LABEL: chosen, ADAPTIVE_ATTEMPT_LABEL: str(attempt_idx + 1), } logger.debug( - "AdaptiveDispatchAttack: attempt %d/%d context=%r technique=%r", + "AdaptiveDispatchAttack: attempt %d/%d technique=%r (hash=%s)", attempt_idx + 1, - self._max_attempts, - adaptive_context, + len(chosen_techniques), + bundle.name, chosen, ) result = await self._run_inner_attack_async( bundle=bundle, seed_group=seed_group, attempt_labels=attempt_labels ) - success = result.outcome == AttackOutcome.SUCCESS - self._selector.record_outcome(context=adaptive_context, technique=chosen, success=success) - trail.append({"technique": chosen, "outcome": result.outcome.value}) + trail.append({"technique": bundle.name, "technique_hash": chosen, "outcome": result.outcome.value}) last_result = result - if success: + if result.outcome == AttackOutcome.SUCCESS: break - # ``max_attempts`` is validated >= 1, so the loop always runs at least - # once. Guard explicitly rather than with ``assert`` (stripped under -O). 
if last_result is None: # pragma: no cover - defensive raise RuntimeError("AdaptiveDispatchAttack ran zero attempts; this should be unreachable.") - # Return a fresh dispatcher-owned ``AttackResult``: the inner attack - # already persisted ``last_result`` via its own post-execute hook, so - # returning it directly would cause a PK conflict on the outer hook. - # ``dataclasses.replace`` copies every field; we override identity - # fields and stamp the trail onto metadata. return replace( last_result, attack_result_id=str(uuid.uuid4()), @@ -364,6 +335,5 @@ async def _perform_async(self, *, context: AdaptiveDispatchContext) -> AttackRes metadata={ **last_result.metadata, "adaptive_attempts": trail, - "adaptive_context": adaptive_context, }, ) diff --git a/pyrit/scenario/scenarios/adaptive/selectors/__init__.py b/pyrit/scenario/scenarios/adaptive/selectors/__init__.py index 7e97f1940..9fe4c2d3c 100644 --- a/pyrit/scenario/scenarios/adaptive/selectors/__init__.py +++ b/pyrit/scenario/scenarios/adaptive/selectors/__init__.py @@ -1,26 +1,20 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-"""Selector protocol, context extractors, and selector implementations.""" +"""Selector protocol and selector implementations.""" from pyrit.scenario.scenarios.adaptive.selectors.epsilon_greedy import ( EpsilonGreedyTechniqueSelector, ) -from pyrit.scenario.scenarios.adaptive.selectors.protocol import ( - GLOBAL_CONTEXT, - UNCATEGORIZED_CONTEXT, - ContextExtractor, +from pyrit.scenario.scenarios.adaptive.selectors.technique_selector import ( + ADAPTIVE_TECHNIQUE_LABEL, + SelectorScope, TechniqueSelector, - global_context, - harm_category_context, ) __all__ = [ - "ContextExtractor", + "ADAPTIVE_TECHNIQUE_LABEL", "EpsilonGreedyTechniqueSelector", - "GLOBAL_CONTEXT", + "SelectorScope", "TechniqueSelector", - "UNCATEGORIZED_CONTEXT", - "global_context", - "harm_category_context", ] diff --git a/pyrit/scenario/scenarios/adaptive/selectors/epsilon_greedy.py b/pyrit/scenario/scenarios/adaptive/selectors/epsilon_greedy.py index 9ff7e9f6b..662ff3cfb 100644 --- a/pyrit/scenario/scenarios/adaptive/selectors/epsilon_greedy.py +++ b/pyrit/scenario/scenarios/adaptive/selectors/epsilon_greedy.py @@ -6,18 +6,21 @@ from __future__ import annotations import hashlib +import logging import random import struct -import threading -from typing import TYPE_CHECKING +from collections.abc import Sequence -if TYPE_CHECKING: - from collections.abc import Sequence +from pyrit.analytics.result_analysis import AttackStats +from pyrit.analytics.scenario_analysis import compute_technique_success_rates +from pyrit.scenario.scenarios.adaptive.selectors.technique_selector import ADAPTIVE_TECHNIQUE_LABEL, SelectorScope +logger = logging.getLogger(__name__) -def _derive_rng(random_seed: int | None, context: str, decision_key: str) -> random.Random: + +def _derive_rng(random_seed: int | None, decision_key: str) -> random.Random: """ - Derive a per-decision ``Random`` from ``(random_seed, context, decision_key)``. + Derive a per-decision ``Random`` from ``(random_seed, decision_key)``. 
Returns: random.Random: A fresh ``random.Random`` seeded deterministically from the @@ -25,158 +28,131 @@ def _derive_rng(random_seed: int | None, context: str, decision_key: str) -> ran """ if random_seed is None: return random.Random() - digest = hashlib.sha256(f"{random_seed}|{context}|{decision_key}".encode()).digest() + digest = hashlib.sha256(f"{random_seed}|{decision_key}".encode()).digest() derived_seed = struct.unpack(" (successes, attempts)`` table. With - probability ``epsilon`` picks uniformly at random; otherwise picks the - technique with the highest Laplace-smoothed estimate ``(s + 1) / (n + 1)`` - (unseen techniques start at 1.0). A ``(context, technique)`` cell with - fewer than ``pool_threshold`` attempts falls back to the technique's - pooled rate across all contexts. - - Each ``select`` call derives a per-decision ``Random`` from - ``(random_seed, context, decision_key)`` so that resume produces deterministic - decisions without persisting RNG state. - - All public methods are guarded by a ``threading.Lock`` so concurrent - callers cannot corrupt the table. The lock makes individual ops atomic, - not the overall select → execute → record sequence. + Stateless epsilon-greedy selector over attack techniques. + + Queries memory for historical success rates and applies epsilon-greedy + selection. With probability ``epsilon`` picks uniformly at random; + otherwise picks the technique with the highest Laplace-smoothed estimate + ``(s + 1) / (n + 1)`` (unseen techniques start at 1.0). + + The selector is **stateless** — it does not maintain internal counts. + All outcome data comes from the memory database via + ``compute_technique_success_rates``. Calling ``select_async`` with the same + arguments produces the same result (deterministic given memory + contents and ``random_seed``). """ - # Tolerance for tiebreaking on float estimates (current estimates are exact - # rationals; this guards against future estimator changes).
_TIE_TOL: float = 1e-12 def __init__( self, *, epsilon: float = 0.2, - pool_threshold: int = 3, + scope: SelectorScope = SelectorScope.ALL_RUNS, random_seed: int | None = None, ) -> None: """ Args: epsilon (float): Exploration probability in [0.0, 1.0]. Defaults to 0.2. - pool_threshold (int): Minimum per-(context, technique) attempts before - the local estimate replaces the pooled rate. Must be >= 1; set to 1 - to disable pooling. Defaults to 3. - random_seed (int | None): Base seed for deterministic per-decision RNG derivation. - Defaults to ``None`` (non-deterministic). + scope (SelectorScope): Whether to use all historical data or only + the current scenario run. Defaults to ``SelectorScope.ALL_RUNS``. + random_seed (int | None): Base seed for deterministic per-decision RNG + derivation. Defaults to ``None`` (non-deterministic). Raises: - ValueError: If ``epsilon`` is outside [0.0, 1.0] or ``pool_threshold`` < 1. + ValueError: If ``epsilon`` is outside [0.0, 1.0]. """ if not 0.0 <= epsilon <= 1.0: raise ValueError(f"epsilon must be in [0.0, 1.0], got {epsilon}") - if pool_threshold < 1: - raise ValueError(f"pool_threshold must be >= 1, got {pool_threshold}") self._epsilon = epsilon - self._pool_threshold = pool_threshold + self._scope = scope self._seed = random_seed - self._counts: dict[tuple[str, str], tuple[int, int]] = {} - # Per-technique pooled counts, kept in sync with ``_counts`` so the - # pooled-backoff branch in ``_estimate`` is O(1). - self._global_counts: dict[str, tuple[int, int]] = {} - # Monotonic counter for auto-generating decision keys when the caller - # doesn't provide one. - self._decision_counter: int = 0 - # Guards _counts, _global_counts, and _decision_counter against concurrent callers. 
- self._lock = threading.Lock() - - def select(self, *, context: str, techniques: Sequence[str], decision_key: str = "") -> str: + + async def select_async( + self, + *, + technique_identifiers: Sequence[str], + objective: str, + num_top_techniques: int = 1, + scenario_result_id: str | None = None, + ) -> Sequence[str]: """ - Pick the next technique to try for ``context``. + Return up to ``num_top_techniques`` techniques in priority order. Args: - context (str): The context key. - techniques (Sequence[str]): Candidate technique names. - decision_key (str): Caller-supplied key (e.g. ``"obj_id:attempt_idx"``) - used to derive a per-decision RNG for deterministic replay. - Defaults to ``""`` (auto-incremented counter). + technique_identifiers (Sequence[str]): Available technique names. + objective (str): The objective text for scoping the per-decision RNG. + num_top_techniques (int): Max techniques to return. Defaults to 1. + scenario_result_id (str | None): If provided, restrict memory + queries to this scenario run. Defaults to ``None`` (all runs). Returns: - str: The chosen technique name. + Sequence[str]: Techniques in priority order. Fewer than + ``num_top_techniques`` if not enough techniques are available. Raises: - ValueError: If ``techniques`` is empty. + ValueError: If ``technique_identifiers`` is empty. 
""" - technique_list = list(techniques) + technique_list = list(technique_identifiers) if not technique_list: - raise ValueError("techniques must contain at least one entry") + raise ValueError("technique_identifiers must contain at least one entry") - with self._lock: - if decision_key: - effective_key = decision_key - else: - effective_key = str(self._decision_counter) - self._decision_counter += 1 - rng = _derive_rng(self._seed, context, effective_key) + num_top_techniques = min(num_top_techniques, len(technique_list)) + + decision_key = objective + rng = _derive_rng(self._seed, decision_key) + + stats = compute_technique_success_rates( + technique_hashes=technique_list, + label_key=ADAPTIVE_TECHNIQUE_LABEL, + scenario_result_id=scenario_result_id if self._scope == SelectorScope.CURRENT_RUN else None, + ) + + chosen: list[str] = [] + remaining = list(technique_list) + + for _ in range(num_top_techniques): + if not remaining: + break if rng.random() < self._epsilon: - return rng.choice(technique_list) + pick = rng.choice(remaining) + else: + estimates = { + t: self._estimate(technique=t, stats=stats) for t in remaining + } + best = max(estimates.values()) + winners = [t for t, v in estimates.items() if v >= best - self._TIE_TOL] + pick = rng.choice(winners) - estimates = {t: self._estimate(context=context, technique=t) for t in technique_list} - best = max(estimates.values()) - winners = [t for t, value in estimates.items() if value >= best - self._TIE_TOL] - return rng.choice(winners) + chosen.append(pick) + remaining.remove(pick) - def record_outcome(self, *, context: str, technique: str, success: bool) -> None: - """ - Record the outcome of an attempt. + return chosen - Args: - context (str): The context key the decision was made under. - technique (str): The technique that was tried. - success (bool): Whether the attempt succeeded. 
+ @staticmethod + def _estimate(*, technique: str, stats: dict[str, AttackStats]) -> float: """ - with self._lock: - successes, attempts = self._counts.get((context, technique), (0, 0)) - attempts += 1 - if success: - successes += 1 - self._counts[(context, technique)] = (successes, attempts) - - global_successes, global_attempts = self._global_counts.get(technique, (0, 0)) - global_attempts += 1 - if success: - global_successes += 1 - self._global_counts[technique] = (global_successes, global_attempts) - - def success_rate(self, *, context: str, technique: str) -> float: - """Return the Laplace-smoothed estimate ``(s + 1) / (n + 1)`` used for exploitation.""" - with self._lock: - return self._estimate(context=context, technique=technique) - - def counts(self, *, context: str, technique: str) -> tuple[int, int]: - """Return raw ``(successes, attempts)`` for a ``(context, technique)`` cell.""" - with self._lock: - return self._counts.get((context, technique), (0, 0)) - - def snapshot(self) -> dict[tuple[str, str], tuple[int, int]]: - """Return a shallow copy of the full counts table (for logging/debug).""" - with self._lock: - return dict(self._counts) - - def _estimate(self, *, context: str, technique: str) -> float: - """ - Estimate for ``(context, technique)``; falls back to pooled rate below - ``pool_threshold`` local attempts. + Laplace-smoothed success-rate estimate for a technique. + + Unseen techniques get ``(0 + 1) / (0 + 1) = 1.0`` (optimistic init). - Callers must already hold ``self._lock``. + Args: + technique (str): The technique name. + stats (dict[str, AttackStats]): Pre-computed stats from memory. Returns: - float: Laplace-smoothed success-rate estimate in ``(0, 1)``. + float: Estimated success rate in ``(0, 1]``. 
""" - local_s, local_n = self._counts.get((context, technique), (0, 0)) - if local_n >= self._pool_threshold: - return (local_s + 1) / (local_n + 1) - global_s, global_n = self._global_counts.get(technique, (0, 0)) - return (global_s + 1) / (global_n + 1) + technique_stats = stats.get(technique) + if technique_stats is None or technique_stats.total_decided == 0: + return 1.0 + return (technique_stats.successes + 1) / (technique_stats.total_decided + 1) diff --git a/pyrit/scenario/scenarios/adaptive/selectors/protocol.py b/pyrit/scenario/scenarios/adaptive/selectors/protocol.py deleted file mode 100644 index e8c8c640f..000000000 --- a/pyrit/scenario/scenarios/adaptive/selectors/protocol.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -"""Selector protocol and context extractors for adaptive scenarios.""" - -from __future__ import annotations - -from collections.abc import Callable, Sequence -from typing import TYPE_CHECKING, Protocol, runtime_checkable - -if TYPE_CHECKING: - from pyrit.models.seeds.seed_attack_group import SeedAttackGroup - -ContextExtractor = Callable[["SeedAttackGroup"], str] -"""Maps a ``SeedAttackGroup`` to an adaptive context key.""" - -GLOBAL_CONTEXT: str = "_global" -"""Default context: all objectives share one selection table.""" - -UNCATEGORIZED_CONTEXT: str = "_uncategorized" -"""Fallback context for seed groups with no harm category metadata.""" - - -def global_context(_seed_attack_group: SeedAttackGroup) -> str: - """ - Return a single shared context for all objectives. - - Returns: - str: Always :data:`GLOBAL_CONTEXT`. - """ - return GLOBAL_CONTEXT - - -def harm_category_context(seed_attack_group: SeedAttackGroup) -> str: - """ - Return a context keyed by the sorted, ``|``-joined harm categories. - - Multi-category seeds form their own bucket; sorting makes the key deterministic. 
- - Returns: - str: The ``|``-joined sorted harm categories, or :data:`UNCATEGORIZED_CONTEXT` - when the seed group has no categories. - """ - categories = seed_attack_group.harm_categories - if not categories: - return UNCATEGORIZED_CONTEXT - return "|".join(sorted(categories)) - - -@runtime_checkable -class TechniqueSelector(Protocol): - """ - Protocol for adaptive technique selectors. - - Any object implementing ``select`` and ``record_outcome`` can serve as - the selector for an ``AdaptiveScenario``. The epsilon-greedy - implementation (:class:`EpsilonGreedyTechniqueSelector`) is the default. - """ - - def select(self, *, context: str, techniques: Sequence[str], decision_key: str = "") -> str: - """Pick the next technique to try for ``context``.""" - ... # pragma: no cover - - def record_outcome(self, *, context: str, technique: str, success: bool) -> None: - """Record the outcome of an attempt.""" - ... # pragma: no cover diff --git a/pyrit/scenario/scenarios/adaptive/selectors/technique_selector.py b/pyrit/scenario/scenarios/adaptive/selectors/technique_selector.py new file mode 100644 index 000000000..d87adb1a2 --- /dev/null +++ b/pyrit/scenario/scenarios/adaptive/selectors/technique_selector.py @@ -0,0 +1,64 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Technique selector protocol for adaptive scenarios.""" + +from __future__ import annotations + +from collections.abc import Sequence +from enum import Enum +from typing import Protocol, runtime_checkable + + +# TODO: probably want to expand this to allow for more filtering options +# (e.g. filter by scenario parameters, attack labels, etc.) 
+class SelectorScope(str, Enum): + """Controls which historical data a selector queries.""" + + ALL_RUNS = "all_runs" + """Use technique success rates from all historical scenario runs.""" + + CURRENT_RUN = "current_run" + """Use technique success rates only from the current scenario run.""" + + +ADAPTIVE_TECHNIQUE_LABEL: str = "_adaptive_technique" +"""Memory-label key the dispatcher stamps on each attack result to record +which technique was used.""" + + +@runtime_checkable +class TechniqueSelector(Protocol): + """ + Protocol for adaptive technique selectors. + + Selectors are **stateless** — they query memory for historical success + rates rather than maintaining internal counts. Calling ``select_async`` + with the same arguments twice should yield the same answer + (deterministic given memory contents). + """ + + async def select_async( + self, + *, + technique_identifiers: Sequence[str], + objective: str, + num_top_techniques: int = 1, + scenario_result_id: str | None = None, + ) -> Sequence[str]: + """ + Return techniques in priority order (try first, try second, …). + + Args: + technique_identifiers (Sequence[str]): Available technique names. + objective (str): The objective text for this selection. + num_top_techniques (int): Max techniques to return. Defaults to 1. + scenario_result_id (str | None): The current scenario run ID, + provided by the dispatcher. Selectors use this when their + scope is ``SelectorScope.CURRENT_RUN``. + + Returns: + Sequence[str]: Up to ``num_top_techniques`` technique names in + priority order. Fewer if not enough techniques are available. + """ + ... 
# pragma: no cover diff --git a/pyrit/scenario/scenarios/adaptive/text_adaptive.py b/pyrit/scenario/scenarios/adaptive/text_adaptive.py index c1d1e588a..a9fb12a03 100644 --- a/pyrit/scenario/scenarios/adaptive/text_adaptive.py +++ b/pyrit/scenario/scenarios/adaptive/text_adaptive.py @@ -22,9 +22,7 @@ from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.scenarios.adaptive.adaptive_scenario import AdaptiveScenario from pyrit.scenario.scenarios.adaptive.selectors import ( - ContextExtractor, TechniqueSelector, - global_context, ) if TYPE_CHECKING: @@ -73,8 +71,6 @@ class TextAdaptive(AdaptiveScenario): comparison and is excluded from the adaptive technique pool. """ - VERSION: int = 1 - _atomic_attack_prefix: ClassVar[str] = "adaptive" _cached_strategy_class: ClassVar[type[ScenarioStrategy] | None] = None @classmethod @@ -130,7 +126,6 @@ def __init__( self, *, objective_scorer: TrueFalseScorer | None = None, - context_extractor: ContextExtractor = global_context, selector: TechniqueSelector | None = None, scenario_result_id: str | None = None, ) -> None: @@ -138,18 +133,14 @@ def __init__( Args: objective_scorer (TrueFalseScorer | None): Scorer used to judge each response. Defaults to the composite scorer from the base class. - context_extractor (ContextExtractor): Maps a ``SeedAttackGroup`` to a - context key. Defaults to ``global_context``. Use - ``harm_category_context`` to partition by harm category. selector (TechniqueSelector | None): Pre-built selector. When ``None`` (default) an :class:`EpsilonGreedyTechniqueSelector` is created with default settings. Pass a custom instance to tune - ``epsilon``, ``pool_threshold``, or ``random_seed``. + ``epsilon`` or ``random_seed``. scenario_result_id (str | None): ID of an existing ``ScenarioResult`` to resume. 
""" super().__init__( objective_scorer=objective_scorer, - context_extractor=context_extractor, selector=selector, scenario_result_id=scenario_result_id, ) diff --git a/tests/unit/analytics/test_scenario_analysis.py b/tests/unit/analytics/test_scenario_analysis.py new file mode 100644 index 000000000..0678f89df --- /dev/null +++ b/tests/unit/analytics/test_scenario_analysis.py @@ -0,0 +1,111 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from unittest.mock import MagicMock, patch + +import pytest + +from pyrit.analytics.scenario_analysis import compute_technique_success_rates +from pyrit.models import AttackOutcome + + +LABEL_KEY = "_adaptive_technique" + + +def _make_result(*, technique: str, outcome: AttackOutcome) -> MagicMock: + r = MagicMock() + r.labels = {LABEL_KEY: technique} + r.outcome = outcome + return r + + +@pytest.fixture(autouse=True) +def _patch_memory(): + mock_memory = MagicMock() + mock_memory.get_attack_results.return_value = [] + with patch("pyrit.memory.CentralMemory") as cm: + cm.get_memory_instance.return_value = mock_memory + yield mock_memory + + +class TestComputeTechniqueSuccessRates: + + def test_empty_results_returns_empty(self, _patch_memory): + stats = compute_technique_success_rates(technique_hashes=["a", "b"], label_key=LABEL_KEY) + assert stats == {} + + def test_counts_successes_and_failures(self, _patch_memory): + _patch_memory.get_attack_results.return_value = [ + _make_result(technique="a", outcome=AttackOutcome.SUCCESS), + _make_result(technique="a", outcome=AttackOutcome.SUCCESS), + _make_result(technique="a", outcome=AttackOutcome.FAILURE), + _make_result(technique="b", outcome=AttackOutcome.FAILURE), + ] + + stats = compute_technique_success_rates(technique_hashes=["a", "b"], label_key=LABEL_KEY) + + assert stats["a"].successes == 2 + assert stats["a"].failures == 1 + assert stats["a"].total_decided == 3 + assert stats["b"].successes == 0 + assert stats["b"].failures == 1 + + def 
test_counts_errors_and_undetermined(self, _patch_memory): + _patch_memory.get_attack_results.return_value = [ + _make_result(technique="a", outcome=AttackOutcome.ERROR), + _make_result(technique="a", outcome=AttackOutcome.UNDETERMINED), + ] + + stats = compute_technique_success_rates(technique_hashes=["a"], label_key=LABEL_KEY) + + assert stats["a"].errors == 1 + assert stats["a"].undetermined == 1 + + def test_ignores_techniques_not_in_requested_list(self, _patch_memory): + _patch_memory.get_attack_results.return_value = [ + _make_result(technique="a", outcome=AttackOutcome.SUCCESS), + _make_result(technique="c", outcome=AttackOutcome.SUCCESS), + ] + + stats = compute_technique_success_rates(technique_hashes=["a", "b"], label_key=LABEL_KEY) + + assert "a" in stats + assert "c" not in stats + + def test_passes_label_key_to_memory_query(self, _patch_memory): + custom_key = "my_custom_key" + compute_technique_success_rates(technique_hashes=["x"], label_key=custom_key) + + call_kwargs = _patch_memory.get_attack_results.call_args[1] + assert call_kwargs["labels"] == {custom_key: ["x"]} + assert call_kwargs["scenario_result_id"] is None + + def test_passes_scenario_result_id_to_memory_query(self, _patch_memory): + compute_technique_success_rates( + technique_hashes=["x"], label_key=LABEL_KEY, scenario_result_id="run-123" + ) + + call_kwargs = _patch_memory.get_attack_results.call_args[1] + assert call_kwargs["scenario_result_id"] == "run-123" + + def test_omits_techniques_with_no_history(self, _patch_memory): + _patch_memory.get_attack_results.return_value = [ + _make_result(technique="a", outcome=AttackOutcome.SUCCESS), + ] + + stats = compute_technique_success_rates(technique_hashes=["a", "b"], label_key=LABEL_KEY) + + assert "a" in stats + assert "b" not in stats + + def test_success_rate_computed(self, _patch_memory): + _patch_memory.get_attack_results.return_value = [ + _make_result(technique="a", outcome=AttackOutcome.SUCCESS), + _make_result(technique="a", 
outcome=AttackOutcome.SUCCESS), + _make_result(technique="a", outcome=AttackOutcome.FAILURE), + _make_result(technique="a", outcome=AttackOutcome.FAILURE), + ] + + stats = compute_technique_success_rates(technique_hashes=["a"], label_key=LABEL_KEY) + + assert stats["a"].success_rate == pytest.approx(0.5) diff --git a/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py b/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py index 963d6bd63..9c6ba29fb 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py +++ b/tests/unit/scenario/scenarios/adaptive/test_dispatcher.py @@ -15,22 +15,16 @@ TechniqueBundle, ) from pyrit.scenario.scenarios.adaptive.selectors import ( - GLOBAL_CONTEXT, EpsilonGreedyTechniqueSelector, - harm_category_context, ) def _make_bundle(*, name: str, outcomes: list[AttackOutcome], seed_technique=None) -> TechniqueBundle: - """Build a TechniqueBundle whose attack stub yields the given outcomes in order. - - The dispatcher routes execution through ``_run_inner_attack_async``; tests - patch that method directly so we only need a placeholder attack here. - """ + """Build a TechniqueBundle whose attack stub yields the given outcomes in order.""" attack = MagicMock(name=f"attack-{name}") attack._outcomes = outcomes attack._name = name - return TechniqueBundle(attack=attack, seed_technique=seed_technique) + return TechniqueBundle(attack=attack, name=name, seed_technique=seed_technique) def _make_context( @@ -56,12 +50,7 @@ def _patch_inner( dispatcher: AdaptiveDispatchAttack, bundles: dict[str, TechniqueBundle], ) -> AsyncMock: - """Replace ``_run_inner_attack_async`` with a stub backed by per-bundle outcomes. - - Returns the AsyncMock so tests can introspect call history (kwargs include - ``bundle`` and ``attempt_labels``). - """ - # Each call consumes one outcome from the chosen bundle's deque. 
+ """Replace ``_run_inner_attack_async`` with a stub backed by per-bundle outcomes.""" name_for_attack = {id(b.attack): name for name, b in bundles.items()} counters: dict[str, int] = dict.fromkeys(bundles, 0) @@ -81,10 +70,26 @@ async def _stub(*, bundle: TechniqueBundle, seed_group, attempt_labels: dict[str return inner_mock +class _StubSelector: + """A deterministic selector stub that returns techniques in the order given.""" + + def __init__(self, *, technique_order: list[str]): + self._order = technique_order + + async def select_async( + self, + *, + technique_identifiers, + objective: str, + num_top_techniques: int = 1, + scenario_result_id: str | None = None, + ): + return self._order[:num_top_techniques] + + @pytest.fixture -def selector() -> EpsilonGreedyTechniqueSelector: - # epsilon=0 makes selection deterministic given the table. - return EpsilonGreedyTechniqueSelector(epsilon=0.0, pool_threshold=1, random_seed=0) +def selector(): + return _StubSelector(technique_order=["a", "b", "c"]) @pytest.fixture @@ -105,6 +110,8 @@ def test_init_rejects_empty_techniques(self, target, selector, seed_group): objective_target=target, techniques={}, selector=selector, + + ) @pytest.mark.parametrize("bad_max", [0, -1]) @@ -115,21 +122,26 @@ def test_init_rejects_invalid_max_attempts(self, target, selector, seed_group, b objective_target=target, techniques={"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])}, selector=selector, + + max_attempts_per_objective=bad_max, ) @pytest.mark.usefixtures("patch_central_database") class TestPerform: - async def test_stops_on_first_success(self, target, selector, seed_group): + async def test_stops_on_first_success(self, target, seed_group): bundles = { "a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS]), "b": _make_bundle(name="b", outcomes=[AttackOutcome.SUCCESS]), } + selector = _StubSelector(technique_order=["a", "b"]) dispatcher = AdaptiveDispatchAttack( objective_target=target, techniques=bundles, 
selector=selector, + + max_attempts_per_objective=5, ) inner = _patch_inner(dispatcher=dispatcher, bundles=bundles) @@ -139,15 +151,19 @@ async def test_stops_on_first_success(self, target, selector, seed_group): assert result.outcome == AttackOutcome.SUCCESS assert inner.call_count == 1 - async def test_retries_until_max_attempts_on_failure(self, target, selector, seed_group): + async def test_retries_until_max_attempts_on_failure(self, target, seed_group): bundles = { "a": _make_bundle(name="a", outcomes=[AttackOutcome.FAILURE] * 3), "b": _make_bundle(name="b", outcomes=[AttackOutcome.FAILURE] * 3), + "c": _make_bundle(name="c", outcomes=[AttackOutcome.FAILURE] * 3), } + selector = _StubSelector(technique_order=["a", "b", "c"]) dispatcher = AdaptiveDispatchAttack( objective_target=target, techniques=bundles, selector=selector, + + max_attempts_per_objective=3, ) inner = _patch_inner(dispatcher=dispatcher, bundles=bundles) @@ -157,84 +173,37 @@ async def test_retries_until_max_attempts_on_failure(self, target, selector, see assert result.outcome == AttackOutcome.FAILURE assert inner.call_count == 3 - async def test_updates_selector_on_each_attempt(self, target, selector, seed_group): - bundles = { - "a": _make_bundle(name="a", outcomes=[AttackOutcome.FAILURE, AttackOutcome.SUCCESS]), - "b": _make_bundle(name="b", outcomes=[AttackOutcome.SUCCESS]), - } + async def test_passes_attempt_labels_to_inner(self, target, seed_group): + bundles = {"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])} + selector = _StubSelector(technique_order=["a"]) dispatcher = AdaptiveDispatchAttack( objective_target=target, techniques=bundles, selector=selector, - max_attempts_per_objective=3, - ) - inner = _patch_inner(dispatcher=dispatcher, bundles=bundles) - - await dispatcher._perform_async(context=_make_context()) - total_attempts = sum(selector.counts(context=GLOBAL_CONTEXT, technique=t)[1] for t in ("a", "b")) - assert total_attempts == inner.call_count - async def 
test_passes_attempt_labels_to_inner(self, target, selector, seed_group): - bundles = {"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])} - dispatcher = AdaptiveDispatchAttack( - objective_target=target, - techniques=bundles, - selector=selector, ) inner = _patch_inner(dispatcher=dispatcher, bundles=bundles) await dispatcher._perform_async(context=_make_context(labels={"foo": "bar"})) labels = inner.call_args.kwargs["attempt_labels"] - assert labels["foo"] == "bar" # caller labels preserved + assert labels["foo"] == "bar" assert labels[ADAPTIVE_TECHNIQUE_LABEL] == "a" assert labels[ADAPTIVE_ATTEMPT_LABEL] == "1" - async def test_uses_adaptive_context_from_extractor(self, target, selector, seed_group): - # Two techniques; one has been heavily rewarded under context "violence" only. + async def test_metadata_records_adaptive_trail(self, target, seed_group): bundles = { - "a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS]), + "a": _make_bundle(name="a", outcomes=[AttackOutcome.FAILURE]), "b": _make_bundle(name="b", outcomes=[AttackOutcome.SUCCESS]), } - for _ in range(5): - selector.record_outcome(context="violence", technique="b", success=True) - for _ in range(5): - selector.record_outcome(context="violence", technique="a", success=False) - + selector = _StubSelector(technique_order=["a", "b"]) dispatcher = AdaptiveDispatchAttack( objective_target=target, techniques=bundles, selector=selector, - context_extractor=harm_category_context, - ) - inner = _patch_inner(dispatcher=dispatcher, bundles=bundles) - ctx = _make_context(harm_categories=["violence"]) - await dispatcher._perform_async(context=ctx) - # Exploit should have picked "b" first. 
- chosen_bundle = inner.call_args.kwargs["bundle"] - assert chosen_bundle is bundles["b"] - async def test_falls_back_to_global_context_with_default_extractor(self, target, selector, seed_group): - bundles = {"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])} - dispatcher = AdaptiveDispatchAttack( - objective_target=target, - techniques=bundles, - selector=selector, - ) - _patch_inner(dispatcher=dispatcher, bundles=bundles) - await dispatcher._perform_async(context=_make_context(labels={})) - - # The global context bucket received the update. - assert selector.counts(context=GLOBAL_CONTEXT, technique="a") == (1, 1) - - async def test_metadata_records_adaptive_trail(self, target, selector, seed_group): - bundles = {"a": _make_bundle(name="a", outcomes=[AttackOutcome.FAILURE, AttackOutcome.SUCCESS])} - dispatcher = AdaptiveDispatchAttack( - objective_target=target, - techniques=bundles, - selector=selector, max_attempts_per_objective=3, ) _patch_inner(dispatcher=dispatcher, bundles=bundles) @@ -242,17 +211,19 @@ async def test_metadata_records_adaptive_trail(self, target, selector, seed_grou trail = result.metadata["adaptive_attempts"] assert trail == [ - {"technique": "a", "outcome": "failure"}, - {"technique": "a", "outcome": "success"}, + {"technique": "a", "technique_hash": "a", "outcome": "failure"}, + {"technique": "b", "technique_hash": "b", "outcome": "success"}, ] - assert result.metadata["adaptive_context"] == GLOBAL_CONTEXT - async def test_returns_fresh_result_distinct_from_inner(self, target, selector, seed_group): + async def test_returns_fresh_result_distinct_from_inner(self, target, seed_group): bundles = {"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])} + selector = _StubSelector(technique_order=["a"]) dispatcher = AdaptiveDispatchAttack( objective_target=target, techniques=bundles, selector=selector, + + ) inner_ids: list[str] = [] @@ -271,10 +242,8 @@ async def _spy(*, bundle, seed_group, attempt_labels): assert 
len(inner_ids) == 1 assert result.attack_result_id != inner_ids[0] - assert result.conversation_id # carried over from inner assert result.outcome == AttackOutcome.SUCCESS - assert result.metadata["adaptive_attempts"] == [{"technique": "a", "outcome": "success"}] - assert result.metadata["adaptive_context"] == GLOBAL_CONTEXT + assert result.metadata["adaptive_attempts"] == [{"technique": "a", "technique_hash": "a", "outcome": "success"}] @pytest.mark.usefixtures("patch_central_database") @@ -285,6 +254,8 @@ def test_validate_rejects_empty_objective(self, target, selector, seed_group, ba objective_target=target, techniques={"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])}, selector=selector, + + ) with pytest.raises(ValueError, match="objective"): dispatcher._validate_context(context=_make_context(objective=bad_objective)) @@ -294,6 +265,7 @@ def test_validate_accepts_normal_objective(self, target, selector, seed_group): objective_target=target, techniques={"a": _make_bundle(name="a", outcomes=[AttackOutcome.SUCCESS])}, selector=selector, + + ) - # Does not raise. dispatcher._validate_context(context=_make_context(objective="ok")) diff --git a/tests/unit/scenario/scenarios/adaptive/test_epsilon_greedy.py b/tests/unit/scenario/scenarios/adaptive/test_epsilon_greedy.py index 42d6b14b4..985a6fe74 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_epsilon_greedy.py +++ b/tests/unit/scenario/scenarios/adaptive/test_epsilon_greedy.py @@ -1,196 +1,132 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
+from unittest.mock import MagicMock, patch + import pytest +from pyrit.analytics.result_analysis import AttackStats from pyrit.scenario.scenarios.adaptive.selectors import ( - GLOBAL_CONTEXT, EpsilonGreedyTechniqueSelector, ) TECHNIQUES = ["a", "b", "c", "d"] -def _seeded_selector( - *, epsilon: float = 0.0, pool_threshold: int = 3, random_seed: int = 0 -) -> EpsilonGreedyTechniqueSelector: - return EpsilonGreedyTechniqueSelector( - epsilon=epsilon, - pool_threshold=pool_threshold, - random_seed=random_seed, - ) +def _seeded_selector(*, epsilon: float = 0.0, random_seed: int = 0) -> EpsilonGreedyTechniqueSelector: + return EpsilonGreedyTechniqueSelector(epsilon=epsilon, random_seed=random_seed) + + +def _empty_rates(*args, **kwargs) -> dict[str, AttackStats]: + """Return empty stats (all techniques unseen).""" + return {} + + +def _rates_with_winner(winner: str, *, successes: int = 5, failures: int = 0): + """Return stats where one technique has a clear win record and others have failures.""" + + def _compute(*args, **kwargs): + stats = {} + total = successes + failures + stats[winner] = AttackStats( + success_rate=successes / total if total else None, + total_decided=total, + successes=successes, + failures=failures, + undetermined=0, + errors=0, + ) + for t in TECHNIQUES: + if t != winner: + stats[t] = AttackStats( + success_rate=0.0, + total_decided=5, + successes=0, + failures=5, + undetermined=0, + errors=0, + ) + return stats + + return _compute class TestEpsilonGreedyTechniqueSelectorInit: def test_init_defaults(self): - selector = EpsilonGreedyTechniqueSelector() - assert selector.snapshot() == {} + EpsilonGreedyTechniqueSelector() @pytest.mark.parametrize("bad_epsilon", [-0.1, 1.1, 2.0, -1.0]) def test_init_rejects_out_of_range_epsilon(self, bad_epsilon): with pytest.raises(ValueError, match="epsilon"): EpsilonGreedyTechniqueSelector(epsilon=bad_epsilon) - def test_init_rejects_pool_threshold_below_one(self): - with pytest.raises(ValueError, 
match="pool_threshold"): - EpsilonGreedyTechniqueSelector(pool_threshold=0) - with pytest.raises(ValueError, match="pool_threshold"): - EpsilonGreedyTechniqueSelector(pool_threshold=-1) - class TestEpsilonGreedyTechniqueSelectorSelect: - def test_select_empty_techniques_raises(self): + @patch("pyrit.scenario.scenarios.adaptive.selectors.epsilon_greedy.compute_technique_success_rates", side_effect=_empty_rates) + async def test_select_empty_techniques_raises(self, _mock): selector = _seeded_selector() - with pytest.raises(ValueError, match="techniques"): - selector.select(context=GLOBAL_CONTEXT, techniques=[]) - - def test_select_all_unseen_ties_resolved_randomly(self): - # With epsilon=0 and an empty table, every technique has estimate 1/1=1.0, - # so the result is the seeded random tiebreak. Different seeds should - # be able to produce different winners. - winners = { - _seeded_selector(random_seed=s).select(context=GLOBAL_CONTEXT, techniques=TECHNIQUES) for s in range(50) - } + with pytest.raises(ValueError, match="technique_identifiers"): + await selector.select_async(technique_identifiers=[], objective="obj") + + @patch("pyrit.scenario.scenarios.adaptive.selectors.epsilon_greedy.compute_technique_success_rates", side_effect=_empty_rates) + async def test_select_all_unseen_ties_resolved_randomly(self, _mock): + winners = set() + for s in range(50): + sel = _seeded_selector(random_seed=s) + result = await sel.select_async(technique_identifiers=TECHNIQUES, objective="obj") + winners.add(result[0]) assert len(winners) > 1 assert winners.issubset(set(TECHNIQUES)) - def test_select_exploits_clear_winner(self): - selector = _seeded_selector(pool_threshold=1) - # Give "b" a track record of pure success, others pure failure. 
- for _ in range(5): - selector.record_outcome(context=GLOBAL_CONTEXT, technique="b", success=True) - for technique in ("a", "c", "d"): - for _ in range(5): - selector.record_outcome(context=GLOBAL_CONTEXT, technique=technique, success=False) - - # With epsilon=0, every selection must exploit the winner. + @patch( + "pyrit.scenario.scenarios.adaptive.selectors.epsilon_greedy.compute_technique_success_rates", + side_effect=_rates_with_winner("b"), + ) + async def test_select_exploits_clear_winner(self, _mock): + selector = _seeded_selector() for _ in range(20): - assert selector.select(context=GLOBAL_CONTEXT, techniques=TECHNIQUES) == "b" + result = await selector.select_async(technique_identifiers=TECHNIQUES, objective="obj") + assert result[0] == "b" - def test_select_epsilon_one_is_pure_random(self): + @patch("pyrit.scenario.scenarios.adaptive.selectors.epsilon_greedy.compute_technique_success_rates", side_effect=_empty_rates) + async def test_select_epsilon_one_is_pure_random(self, _mock): selector = _seeded_selector(epsilon=1.0) - # Bias the table heavily toward "a"; with epsilon=1 it must still be ignored. - for _ in range(20): - selector.record_outcome(context=GLOBAL_CONTEXT, technique="a", success=True) - - picks = [selector.select(context=GLOBAL_CONTEXT, techniques=TECHNIQUES) for _ in range(200)] - assert set(picks) == set(TECHNIQUES) - - def test_select_epsilon_zero_never_explores(self): - selector = _seeded_selector(epsilon=0.0, pool_threshold=1) - for _ in range(3): - selector.record_outcome(context=GLOBAL_CONTEXT, technique="a", success=True) - # Make the other techniques tried-and-failed so they fall below "a"'s estimate; - # unseen techniques would otherwise tie at the optimistic 1.0. 
- for technique in ("b", "c", "d"): - selector.record_outcome(context=GLOBAL_CONTEXT, technique=technique, success=False) - for _ in range(50): - assert selector.select(context=GLOBAL_CONTEXT, techniques=TECHNIQUES) == "a" - - def test_select_cold_start_round_robins(self): - # Optimistic init + epsilon=0: untried techniques tie at 1.0 and beat tried-and-failed - # techniques (1/2 = 0.5). So the first failures push each technique to "tried" exactly once - # before any technique gets tried twice. - selector = _seeded_selector(pool_threshold=1) - tried: list[str] = [] - for _ in range(len(TECHNIQUES)): - technique = selector.select(context=GLOBAL_CONTEXT, techniques=TECHNIQUES) - tried.append(technique) - selector.record_outcome(context=GLOBAL_CONTEXT, technique=technique, success=False) - assert sorted(tried) == sorted(TECHNIQUES) - - -class TestEpsilonGreedyTechniqueSelectorUpdate: - def test_record_outcome_accumulates_counts(self): + picks = set() + for i in range(200): + result = await selector.select_async( + technique_identifiers=TECHNIQUES, objective=f"obj-{i}" + ) + picks.add(result[0]) + assert picks == set(TECHNIQUES) + + @patch("pyrit.scenario.scenarios.adaptive.selectors.epsilon_greedy.compute_technique_success_rates", side_effect=_empty_rates) + async def test_select_returns_multiple_techniques(self, _mock): selector = _seeded_selector() - selector.record_outcome(context="ctx", technique="a", success=True) - selector.record_outcome(context="ctx", technique="a", success=False) - selector.record_outcome(context="ctx", technique="a", success=True) - assert selector.counts(context="ctx", technique="a") == (2, 3) - - def test_record_outcome_separate_contexts_are_independent(self): + result = await selector.select_async( + technique_identifiers=TECHNIQUES, objective="obj", num_top_techniques=3 + ) + assert len(result) == 3 + assert len(set(result)) == 3 # no duplicates + + 
@patch("pyrit.scenario.scenarios.adaptive.selectors.epsilon_greedy.compute_technique_success_rates", side_effect=_empty_rates) + async def test_select_caps_at_available_techniques(self, _mock): selector = _seeded_selector() - selector.record_outcome(context="x", technique="a", success=True) - selector.record_outcome(context="y", technique="a", success=False) - assert selector.counts(context="x", technique="a") == (1, 1) - assert selector.counts(context="y", technique="a") == (0, 1) + result = await selector.select_async( + technique_identifiers=["a", "b"], objective="obj", num_top_techniques=5 + ) + assert len(result) == 2 + + +class TestEpsilonGreedyEstimate: + def test_estimate_unseen_is_one(self): + assert EpsilonGreedyTechniqueSelector._estimate(technique="a", stats={}) == pytest.approx(1.0) + + def test_estimate_with_data(self): + stats = { + "a": AttackStats( + success_rate=0.6, total_decided=5, successes=3, failures=2, undetermined=0, errors=0 + ) + } + # (3 + 1) / (5 + 1) = 4/6 ≈ 0.6667 + assert EpsilonGreedyTechniqueSelector._estimate(technique="a", stats=stats) == pytest.approx(4 / 6) - def test_counts_default_zero_for_unseen(self): - selector = _seeded_selector() - assert selector.counts(context="missing", technique="missing") == (0, 0) - - def test_record_outcome_keeps_pooled_global_counts_in_sync(self): - # Pooled-global counts back the O(1) pooled-backoff branch in _estimate. - # They must aggregate across contexts for a given technique. - selector = _seeded_selector(pool_threshold=5) - selector.record_outcome(context="x", technique="a", success=True) - selector.record_outcome(context="y", technique="a", success=False) - selector.record_outcome(context="z", technique="a", success=True) - selector.record_outcome(context="x", technique="b", success=True) - - # Below the local threshold, _estimate must use the pooled-global rate. 
- # technique "a": 2 successes / 3 attempts -> (2+1)/(3+1) = 0.75 - assert selector.success_rate(context="new_ctx", technique="a") == pytest.approx(0.75) - # technique "b": 1/1 -> (1+1)/(1+1) = 1.0 - assert selector.success_rate(context="new_ctx", technique="b") == pytest.approx(1.0) - # Unseen technique "c" -> (0+1)/(0+1) = 1.0 - assert selector.success_rate(context="new_ctx", technique="c") == pytest.approx(1.0) - - -class TestEpsilonGreedyTechniqueSelectorEstimate: - def test_success_rate_unseen_is_one(self): - # Optimistic init: (0 + 1) / (0 + 1) = 1.0 - selector = _seeded_selector() - assert selector.success_rate(context="ctx", technique="a") == pytest.approx(1.0) - - def test_success_rate_local_when_above_threshold(self): - selector = _seeded_selector(pool_threshold=2) - for _ in range(3): - selector.record_outcome(context="ctx", technique="a", success=True) - # (3 + 1) / (3 + 1) = 1.0 - assert selector.success_rate(context="ctx", technique="a") == pytest.approx(1.0) - - def test_success_rate_pools_when_below_threshold(self): - selector = _seeded_selector(pool_threshold=5) - # Local cell has only 1 attempt (below threshold). - selector.record_outcome(context="ctx", technique="a", success=False) - # Other contexts have plenty of data for technique "a". - for _ in range(10): - selector.record_outcome(context="other", technique="a", success=True) - # Pooled estimate = (10 + 0 + 1) / (10 + 1 + 1) = 11/12. 
- assert selector.success_rate(context="ctx", technique="a") == pytest.approx(11 / 12) - - -class TestEpsilonGreedyTechniqueSelectorConcurrency: - """Concurrent record_outcome / select calls must not corrupt counts.""" - - def test_concurrent_record_outcome_preserves_total_attempts(self): - import threading - - selector = _seeded_selector(pool_threshold=1) - threads_per_arm = 8 - attempts_per_thread = 100 - techniques = ["a", "b", "c", "d"] - - def worker(technique: str, success_pattern: list[bool]) -> None: - for ok in success_pattern: - selector.record_outcome(context=GLOBAL_CONTEXT, technique=technique, success=ok) - - threads: list[threading.Thread] = [] - expected_successes: dict[str, int] = dict.fromkeys(techniques, 0) - for t in techniques: - for i in range(threads_per_arm): - pattern = [(j + i) % 2 == 0 for j in range(attempts_per_thread)] - expected_successes[t] += sum(pattern) - threads.append(threading.Thread(target=worker, args=(t, pattern))) - - for th in threads: - th.start() - for th in threads: - th.join() - - # Every increment landed: no lost updates from interleaved read-modify-write. - for t in techniques: - successes, attempts = selector.counts(context=GLOBAL_CONTEXT, technique=t) - assert attempts == threads_per_arm * attempts_per_thread - assert successes == expected_successes[t] diff --git a/tests/unit/scenario/scenarios/adaptive/test_protocol.py b/tests/unit/scenario/scenarios/adaptive/test_protocol.py deleted file mode 100644 index 5d9b764e7..000000000 --- a/tests/unit/scenario/scenarios/adaptive/test_protocol.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -from unittest.mock import MagicMock - -from pyrit.scenario.scenarios.adaptive.selectors import ( - GLOBAL_CONTEXT, - UNCATEGORIZED_CONTEXT, - EpsilonGreedyTechniqueSelector, - TechniqueSelector, - global_context, - harm_category_context, -) - - -class TestTechniqueSelectorProtocol: - def test_implements_protocol(self): - selector = EpsilonGreedyTechniqueSelector() - assert isinstance(selector, TechniqueSelector) - - -class TestContextExtractors: - def test_global_context_is_constant(self): - sg = MagicMock() - assert global_context(sg) == GLOBAL_CONTEXT - - def test_harm_category_context_joins_sorted_categories(self): - sg = MagicMock() - sg.harm_categories = ["violence", "hate"] - # Multi-category seeds form their own bucket; sorting keeps the key deterministic. - assert harm_category_context(sg) == "hate|violence" - - def test_harm_category_context_single_category(self): - sg = MagicMock() - sg.harm_categories = ["violence"] - assert harm_category_context(sg) == "violence" - - def test_harm_category_context_falls_back_when_empty(self): - sg = MagicMock() - sg.harm_categories = [] - assert harm_category_context(sg) == UNCATEGORIZED_CONTEXT - - def test_harm_category_context_falls_back_when_none(self): - sg = MagicMock() - sg.harm_categories = None - assert harm_category_context(sg) == UNCATEGORIZED_CONTEXT diff --git a/tests/unit/scenario/scenarios/adaptive/test_technique_selector.py b/tests/unit/scenario/scenarios/adaptive/test_technique_selector.py new file mode 100644 index 000000000..5167cf331 --- /dev/null +++ b/tests/unit/scenario/scenarios/adaptive/test_technique_selector.py @@ -0,0 +1,13 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +from pyrit.scenario.scenarios.adaptive.selectors import ( + EpsilonGreedyTechniqueSelector, + TechniqueSelector, +) + + +class TestTechniqueSelectorProtocol: + def test_implements_protocol(self): + selector = EpsilonGreedyTechniqueSelector() + assert isinstance(selector, TechniqueSelector) diff --git a/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py b/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py index 13c8fd97b..786f3692a 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py +++ b/tests/unit/scenario/scenarios/adaptive/test_text_adaptive.py @@ -18,10 +18,6 @@ from pyrit.scenario.scenarios.adaptive.dispatcher import ( AdaptiveDispatchAttack, ) -from pyrit.scenario.scenarios.adaptive.selectors import ( - GLOBAL_CONTEXT, - harm_category_context, -) from pyrit.scenario.scenarios.adaptive.text_adaptive import TextAdaptive from pyrit.score import TrueFalseScorer @@ -209,47 +205,6 @@ async def test_atomics_share_one_selector_across_dispatchers(self, mock_objectiv selectors = {id(d._selector) for d in dispatchers} assert len(selectors) == 1 - async def test_default_context_extractor_is_global(self, mock_objective_target, mock_objective_scorer): - from pyrit.scenario.scenarios.adaptive.selectors import global_context - - groups = { - "violence": [_make_seed_group(value="obj-1", harm_categories=["violence"])], - "hate": [_make_seed_group(value="obj-2", harm_categories=["hate"])], - } - _scenario, attacks = await self._build_scenario_and_attacks( - mock_objective_target=mock_objective_target, - mock_objective_scorer=mock_objective_scorer, - seed_groups=groups, - ) - for atomic in attacks: - dispatcher = atomic._attack_technique.attack - # All seed groups in a global-extractor scenario resolve to the same - # context bucket regardless of harm category. 
- for sg in atomic.seed_groups: - assert dispatcher._context_extractor(sg) == GLOBAL_CONTEXT - assert dispatcher._context_extractor is global_context - - async def test_harm_category_extractor_partitions_contexts(self, mock_objective_target, mock_objective_scorer): - groups = { - "violence": [_make_seed_group(value="obj-v", harm_categories=["violence"])], - "hate": [_make_seed_group(value="obj-h", harm_categories=["hate"])], - "uncat": [_make_seed_group(value="obj-u", harm_categories=None)], - } - _scenario, attacks = await self._build_scenario_and_attacks( - mock_objective_target=mock_objective_target, - mock_objective_scorer=mock_objective_scorer, - seed_groups=groups, - context_extractor=harm_category_context, - ) - contexts: set[str] = set() - for atomic in attacks: - dispatcher = atomic._attack_technique.attack - assert dispatcher._context_extractor is harm_category_context - for sg in atomic.seed_groups: - contexts.add(dispatcher._context_extractor(sg)) - # Each harm category gets its own context bucket. - assert contexts == {"violence", "hate", "_uncategorized"} - async def test_atomic_names_are_dataset_scoped(self, mock_objective_target, mock_objective_scorer): groups = { "violence": [_make_seed_group(value=f"obj-{i}", harm_categories=["violence"]) for i in range(5)], @@ -318,9 +273,10 @@ async def test_techniques_with_seed_technique_are_kept(self, mock_objective_targ dispatcher = attacks[0]._attack_technique.attack assert isinstance(dispatcher, AdaptiveDispatchAttack) # Both factories survive; in particular the seeded one is no longer - # silently dropped. - assert "role_play" in dispatcher._techniques - assert "many_shot" in dispatcher._techniques + # silently dropped. Keys are now eval hashes; check by bundle name. 
+ technique_names = {b.name for b in dispatcher._techniques.values()} + assert "role_play" in technique_names + assert "many_shot" in technique_names async def test_incompatible_seed_technique_is_filtered_per_objective( self, mock_objective_target, mock_objective_scorer @@ -353,9 +309,10 @@ async def test_incompatible_seed_technique_is_filtered_per_objective( # shared by the dispatcher; per-call compatibility filtering now # happens inside ``AdaptiveDispatchAttack._perform_async``. The seed # group survived because the plain (no-seed_technique) factory keeps - # the compatible pool non-empty. - assert "role_play" in dispatcher._techniques - assert "many_shot" in dispatcher._techniques + # the compatible pool non-empty. Keys are now eval hashes; check by bundle name. + technique_names = {b.name for b in dispatcher._techniques.values()} + assert "role_play" in technique_names + assert "many_shot" in technique_names assert len(attacks[0].seed_groups) == 1 async def test_objective_skipped_when_no_compatible_techniques( @@ -402,120 +359,6 @@ def _selective_compat(self_group, *, technique): assert any("obj-skip" in record.getMessage() for record in caplog.records) -@pytest.mark.usefixtures(*FIXTURES) -class TestTextAdaptiveSelectorRehydration: - """When resuming, prior dispatch trails should replay into the new selector.""" - - def _build_scenario_no_resume_id(self, *, scorer): - return TextAdaptive(objective_scorer=scorer) - - def test_no_scenario_result_id_is_noop(self, mock_objective_scorer): - from pyrit.scenario.scenarios.adaptive.selectors import EpsilonGreedyTechniqueSelector - - scenario = TextAdaptive(objective_scorer=mock_objective_scorer) - selector = EpsilonGreedyTechniqueSelector() - # No scenario_result_id set -> early return, no errors, no replays. 
- scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a", "b"}) - assert selector.snapshot() == {} - - def test_replays_attempts_from_metadata(self, mock_objective_scorer): - from pyrit.models import AttackResult - from pyrit.scenario.scenarios.adaptive.selectors import EpsilonGreedyTechniqueSelector - - scenario = TextAdaptive(objective_scorer=mock_objective_scorer, scenario_result_id="rid") - - rows = [ - AttackResult( - conversation_id="c1", - objective="o1", - attribution_data={"parent_collection": "adaptive_violence"}, - metadata={ - "adaptive_attempts": [ - {"technique": "a", "outcome": "failure"}, - {"technique": "b", "outcome": "success"}, - ], - "adaptive_context": "violence", - }, - ), - AttackResult( - conversation_id="c2", - objective="o2", - attribution_data={"parent_collection": "adaptive_hate"}, - metadata={ - "adaptive_attempts": [{"technique": "a", "outcome": "success"}], - "adaptive_context": "hate", - }, - ), - ] - - selector = EpsilonGreedyTechniqueSelector() - with patch.object(scenario._memory, "get_attack_results", return_value=rows): - scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a", "b"}) - - # Trails replayed verbatim into the per-context table. 
- assert selector.counts(context="violence", technique="a") == (0, 1) - assert selector.counts(context="violence", technique="b") == (1, 1) - assert selector.counts(context="hate", technique="a") == (1, 1) - - def test_skips_unknown_techniques(self, mock_objective_scorer): - from pyrit.models import AttackResult - from pyrit.scenario.scenarios.adaptive.selectors import EpsilonGreedyTechniqueSelector - - scenario = TextAdaptive(objective_scorer=mock_objective_scorer, scenario_result_id="rid") - rows = [ - AttackResult( - conversation_id="c1", - objective="o1", - attribution_data={"parent_collection": "adaptive_violence"}, - metadata={ - "adaptive_attempts": [ - {"technique": "removed_technique", "outcome": "success"}, - {"technique": "a", "outcome": "failure"}, - ], - "adaptive_context": "ctx", - }, - ), - ] - - selector = EpsilonGreedyTechniqueSelector() - with patch.object(scenario._memory, "get_attack_results", return_value=rows): - scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a"}) - - # Only the known technique was recorded. 
- assert selector.counts(context="ctx", technique="a") == (0, 1) - assert selector.counts(context="ctx", technique="removed_technique") == (0, 0) - - def test_ignores_results_without_adaptive_metadata(self, mock_objective_scorer): - from pyrit.models import AttackResult - from pyrit.scenario.scenarios.adaptive.selectors import EpsilonGreedyTechniqueSelector - - scenario = TextAdaptive(objective_scorer=mock_objective_scorer, scenario_result_id="rid") - rows = [ - AttackResult( - conversation_id="c", - objective="o", - attribution_data={"parent_collection": "adaptive_violence"}, - metadata={}, - ), - ] - - selector = EpsilonGreedyTechniqueSelector() - with patch.object(scenario._memory, "get_attack_results", return_value=rows): - scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a"}) - assert selector.snapshot() == {} - - def test_memory_load_failure_is_swallowed(self, mock_objective_scorer): - from pyrit.scenario.scenarios.adaptive.selectors import EpsilonGreedyTechniqueSelector - - scenario = TextAdaptive(objective_scorer=mock_objective_scorer, scenario_result_id="rid") - - selector = EpsilonGreedyTechniqueSelector() - with patch.object(scenario._memory, "get_attack_results", side_effect=RuntimeError("db down")): - # Must not raise; selector remains empty. - scenario._rehydrate_selector_from_memory(selector=selector, known_techniques={"a"}) - assert selector.snapshot() == {} - - @pytest.mark.usefixtures(*FIXTURES) class TestTextAdaptiveBaselinePolicy: async def test_initialize_async_accepts_explicit_baseline(self, mock_objective_target, mock_objective_scorer):