diff --git a/.github/workflows/frontend_tests.yml b/.github/workflows/frontend_tests.yml index ebb97b3a64..0593f9b894 100644 --- a/.github/workflows/frontend_tests.yml +++ b/.github/workflows/frontend_tests.yml @@ -150,3 +150,10 @@ jobs: - name: Run TypeScript type check run: npx tsc --noEmit + + # Tree-UI contract tests rely on `satisfies` clauses that ts-jest does + # not enforce at runtime. tsconfig.contract.json scopes a type-check + # narrowly to the contract files + their type dependencies so backend + # wire-shape drift is caught at build time. + - name: Run tree-UI contract type check + run: npx tsc --noEmit -p tsconfig.contract.json diff --git a/doc/gui/design/01_tree_primitives.md b/doc/gui/design/01_tree_primitives.md new file mode 100644 index 0000000000..6e9655baa8 --- /dev/null +++ b/doc/gui/design/01_tree_primitives.md @@ -0,0 +1,2136 @@ +# Tree-Based UI — Foundational Primitives + +> Status: **DRAFT for review (revision 18)** — design + vocabulary only, no implementation. +> Scope: foundational layer (data model, lifecycle, mapping to backend). +> Out of scope: rendering details, layout algorithm, UI affordances, telemetry. +> **V1 decision (§12.0): conversation tree persistence is client-only React state.** The persistence spike from revision 2 is deferred to V2 (preserved in §11 as future work). One consequence flows down: V1 deliberately does NOT write `conversation_tree_node_id` into `MessagePiece.prompt_metadata`, eliminating the orphaned-pointer concern that motivated the spike (see §7.3). + +## 0. Rolling revision history + +This preamble summarizes the rolling rationale across the doc set ([01_tree_primitives.md](01_tree_primitives.md), [02_tree_ui_affordances.md](02_tree_ui_affordances.md), [03_runner.md](03_runner.md)) so a new reader can see what changed across review cycles without diffing. 
Each revision absorbed a principal-engineer reviewer pass; closures are referenced from inline `(rev N, per reviewer Finding X)` notes throughout the docs. + +| Rev | Dominant theme | Headline closures | +|---|---|---| +| **15** | Anti-amplification + entry-point hygiene | Q.6 intra-wave memoization cut; §3.1 step 2b retry-failed pre-readiness demotion; 5-step entry-point shim formalized; tag-hygiene gate moved out of the dispatch loop into the shim; §6.4.1 `node.execution = null` on failure made load-bearing for the resolver's `is_stale` predicate. | +| **16** | Undo correctness + wave-summary fidelity | §6.9 `UndoOp` discriminated union with state-snapshot widening (closes the silent half-broken-undo class from Findings 6+7); `complete.summary.failed` bucketed as `{transient, rate_limited, permanent}`; `legacy single-int helper` migration spelled out. | +| **17** | Surface-area cleanup (Nits U–Z) | §9.4.1 reload hoists `parent_conversation_tree_id` from leaf labels; `undoStack` carried into `branchToNewTree` clone alongside `edited` state; `refreshNode(fan_id)` aliased to `refreshSubtree(id)`; `operator: ''` defense-in-depth fallback deleted, replaced with hard assert at the `_build_labels` callsite. | +| **18** | Citation refresh + dimension-B + 4th-pass closures + rubber-duck cheap wins + Q.S.1/Q.S.2 decisions | F7 mechanical sweep updates ~16 `.py:L` citations; dimension-B closes 7 deferred items (`NodeParams` union, `path.edge_slot_for`, lockManager unification, `recordExecution` null-prior semantics, two §5.x operator edge-cases, anchor sweep verified clean); 4th-pass reviewer closes 11 findings including the `cancelWave` execution-clobber gate, the `skipped` wave-summary bucket, the queue-drain-interleaving V1.0 documented limitation, the Picked-state `↻×N` exception, the `CrossTabLockManager` interface-block deletion, the `lastError` auto-clear-on-running rule, and four nit-level fixes. 
Rubber-duck rev-18 cheap wins: per-leaf `ExecutionRecord` timing fields + per-`WaveEvent` `emittedAt`, `version: number` on `ConversationTreeNodeBase` for V2 last-write-wins forward-compat, i18n string-registry V1.0 commitment, FanNode polymorphism honest naming + axis-addition checklist, `↻` tooltip cost-preview, `permanent` failure class surfaced distinctly in the wave-complete toast, client-side telemetry-vs-privacy line in §15. **Q.S.1 DECIDED:** accept-and-disclose — V1.0 ships without intra-wave memoization; Crescendo cost cliff documented in §1.2; revisit V1.x with [Q.S.4](03_runner.md#12-open-questions) experiment data. **Q.S.2 DECIDED:** operator-as-tag (honor-system) — §9.4.5 scaled back to relocation-only (no anonymous-rejection); the no-labels early-return preserved; V1.1 multi-operator collab revisits. **Q.S.3 remains a V1.0 gate item** pending the Q.S.4 Crescendo experiment outcome. Q.S.5–Q.S.9 are PR-sized follow-ups. | + +The net architectural commitment surface (ConversationTree vs AttackResult split, AR-per-leaf, two-function branching, labels-round-trip contract, failure-class trichotomy + skipped bucket, 5-step entry-point shim, schema-versioned sessionStorage, per-tree `UndoOp[]` with state-snapshot widening, operator-as-tag honor-system per Q.S.2) has been stable since rev 15 and survived four reviewer passes plus a rubber-duck pass. The freshest rubber-duck assessment was *"substantive revisions, not back to the drawing board ... with three landed and the §E Crescendo experiment run, this is a ship-it document"* — of those three, [Q.S.1](03_runner.md#12-open-questions) (DECIDED: accept-and-disclose) and [Q.S.2](03_runner.md#12-open-questions) (DECIDED: operator-as-tag) have landed; only [Q.S.3](03_runner.md#12-open-questions) (per-target rate-limit circuit breaker) remains, gated on the [Q.S.4](03_runner.md#12-open-questions) Crescendo experiment. + +### Version-scope legend + +Sections below carry inline version markers. 
The whole doc describes the eventual V1 design; V1.0 is the shippable subset. + +| Marker | Meaning | +|---|---| +| **V1.0** | Ships in the first tree-UI release. | +| **V1.1** | Designed-and-scoped; deferred from V1.0 to keep the first release small. Disabled-stub UI lands in V1.0 only where the V1.1 trigger would otherwise be repurposed (avoids behavior-change regressions). | +| **V1.x** | Designed-but-uncommitted; lands when an operator-driven need surfaces. | +| **V2** | Requires server-side conversation tree persistence (§11). | + +The §1 Non-Goals enumerates the explicit V1.0 exclusions; later sections use the markers above on individual subsections. + +## 1. Goals & Non-Goals + +### Goals + +1. **Make branching explicit and visual.** Replace the implicit "reverse-chronological list of forks" ([ConversationPanel.tsx](../../../frontend/src/components/Chat/ConversationPanel.tsx)) with a 2-D tree where every fork, retry, and converter variant is a node the user can see, edit, and reason about. +2. **One fan-out primitive, many axes.** Today "5 retries", "branch into a new conversation", and "apply each of 3 converters" are three different code paths (`max_attempts_on_failure` adds turns; `create_related_conversation_async` adds branches; `convertersApi.previewConversion` is single-shot). Collapse them into a single `FanNode` whose `axis` discriminates `attempt | prompt | converter | target | system_prompt | temperature | …`. Adding a new axis is a registration, not a new node type. (See §4.4.) +3. **Make propagation opt-in and inspectable.** When the user edits an upstream node, downstream nodes mark *stale* but do not auto-rerun. The user explicitly invokes a refresh — per-node, per-subtree, or whole-tree. +4. **Preserve previous executions through edits.** Re-running a node does not destroy what came before; the old `ExecutionRecord` is moved into `executionHistory` (capped, see §6) before the new one is recorded. 
The backend's append-only `MessagePiece` model ([message_piece.py#L110](../../../pyrit/models/messages/message_piece.py#L110)) handles persistence; the conversation tree layer just keeps the pointers. *Note: this is not the same as "no data duplication" — each branch is a full copy of upstream pieces; see §7 storage cost note.* +5. **Be additive.** The existing linear `ChatWindow` ([frontend/src/components/Chat/ChatWindow.tsx](../../../frontend/src/components/Chat/ChatWindow.tsx)) keeps working; the tree view is a sibling view that operates on the same `AttackResult`. + +### Non-Goals (universal — apply to all V1 releases and beyond) + +- Replacing the linear chat for users who prefer it. +- **Server-side conversation tree persistence.** V1 stores the conversation tree in React state, reconstructed on reload from backend labels (§9.4.1). The orphan-pointer concern from revision 2 evaporates because V1 writes no conversation tree references into the backend (see §7.3). Full server-side conversation trees become a V2 feature (§11). +- **Multi-tab conversation tree synchronization, undo/redo, conversation tree sharing across operators.** All require server-side conversation tree storage; out of V1. +- **Distributed fan-out / queueing / rate-limit-aware scheduling.** V1 is single-user, in-process concurrency with a simple `maxParallel` cap scoped per-Workspace (see §12.2). PyRIT's existing `RoundRobinTarget` ([round_robin_target.py:L15](../../../pyrit/prompt_target/round_robin_target.py#L15)) handles cross-endpoint load balancing transparently at the target layer; the tree runner does not need to. Per-target sub-budgets are a future consideration but not on the immediate roadmap. +- **Auto-layout polish.** Buchheim-Walker via `d3-hierarchy.tree()` for V1.0 (see §8); main-path pinning and adaptive collapse are V1.1. 
+- **Auto-scoring on every Send.** No "default scorer" concept exists in the GUI's `add_message` flow today; default scorers exist only inside `Scenario` orchestration ([scenario.py:L375-L410](../../../pyrit/scenario/core/scenario.py#L375-L410)). Adding one is out of V1; `ScoreNode` remains always-explicit (§12.4). + +## 1.1 V1.0 explicit exclusions (deferred to V1.1) + +The following are scoped and designed in this doc but **do not ship in the V1.0 release** — they ship in V1.1. Reviewers can read this section as the V1.0 cut surface at a glance. + +- **Workspace tab strip (§13.3+).** V1.0 ships the **minimal Workspace** data model (§13.1) — `{ currentTree; recentTreeIds; settings }` — with a "Switch tree" affordance in the canvas-level ribbon. **V1.1 adds the full tab strip** (`conversationTrees: ConversationTree[]`, drag-reorder, multi-tree concurrency wiring). *Rationale:* the minimal Workspace is ~30 LOC and is the data-model precondition for `branchFromNode` (next item); the tab strip is a UI surface, not a data-model requirement. Splitting them lets V1.0 ship `branchFromNode` without paying for tab-strip UX. +- **`branchFromNode` sibling-subtree variant (§6.5).** V1.0 **ships the always-new-tree variant** (clicking `📋` "Branch from here" / "Clone tree" swaps the active Workspace `currentTree` to the clone; source re-openable from History via auto-reverse, §9.3). **The sibling-subtree-same-canvas variant (`🌿`) is V1.1** — it requires a render-rule disambiguation (dashed "branch" edge style vs. solid fan edges) that is not in V1.0's critical path. V1.0 renders the `🌿` slot as a disabled stub per [02 §2.2](02_tree_ui_affordances.md#22-per-node-action-rail) (slot reservation against UX regression). *V1.0 fallback for side-by-side comparison:* two browser tabs, each holding one Workspace `currentTree`, mediated by the §9.4.3 `BroadcastChannel` advisory lock. 
+- **Synced-Peers Stack and Stack-`+` gating ([02 §3.2, §3.4a](02_tree_ui_affordances.md#32-synced-peers-stack--synchronized-authoring-surface)).** V1.0 ships Fan-Children Stack ([02 §3.1](02_tree_ui_affordances.md#31-fan-children-stack--visual-aggregation-only)) — the visual aggregation of N identical fan children. The synchronized-authoring surface (fan-through, the `addedToStack` field, parent-walk peer detection, draft-placeholder semantics under Promoted state) lands in V1.1, **with the design treated as provisional pending V1.0 operator feedback** — see [02 §3.2](02_tree_ui_affordances.md#32-synced-peers-stack--synchronized-authoring-surface) banner. +- **Main-path pinning ([02 §4.3](02_tree_ui_affordances.md#43-recommendation-buchheimwalker--pinned-main-path--adaptive-collapse)).** V1.0 renders with plain `d3-hierarchy.tree()`. The `★ Pin as main` affordance on `SendNode` and the centerline-pinning layout pass land in V1.1. +- **Fan axes beyond `attempt` and `converter` (§4.4).** V1.0 ships those two axes (the most-requested operator workflows: re-run N times, sweep converters). `prompt`, `target`, `system_prompt`, `temperature` are scoped here but ship in V1.1+. *Rationale:* the runner branches and DTO mappings differ per axis, and the V1.0 attempt+converter pair already exercises every primitive in the runner; adding more axes is multiplicative test surface that V1.1 absorbs once V1.0 has soaked. +- **Auto-reverse fan-out detection for pre-V1.0 ARs ([§9.3](#93-migration-of-existing-linear-attacks---auto-reverse-to-a-tree)).** V1.0 ships **both** the linear-chain reconstruction AND the V1.0+ fast-path `detect_fans_v10_plus` (§9.3.1) that decodes `labels.tree_path` to rebuild nested fan structure exactly for trees produced by the V1.0 runner — this is the load-bearing path for the §9.4.1 reload-reconstruction story. 
**The pre-tree-UI fallback `detect_fans_pre_v10`** (the `original_prompt_id` chain-flattening + `wave_id`-disambiguation algorithm for historical ARs that have no `tree_path` label) lands in V1.1. *Why the split:* the V1.0+ fast path is ~30 LOC reading labels the runner already writes; deferring it would mean V1.0 sessions reload as flat lists of leaves, which is operator-hostile and unnecessary. The pre-V1.0 fallback has substantially more edge-case test surface (wave_id disambiguation, nesting-loss caveat, multi-branch-from-same-piece) and operates on data that mostly hasn't been authored yet (the corpus of pre-tree-UI ARs is bounded; the corpus of V1.0 trees is the future). + +These exclusions are inter-related but no longer all-or-nothing: V1.0 keeps `branchFromNode` (the most-used operator motion) by shipping the minimal-Workspace data model; the tab strip, sibling-subtree variant, Synced-Peers Stack, main-path pinning, and extra fan axes are deferred as a coherent V1.1 release. + +## 1.2 V1.0 known limitations (sharp edges in what V1.0 DOES ship) + +Distinct from §1.1 (deferred features). These are limits of features that V1.0 *does* ship — operators will hit them and the design tells them what to do. + +- **No clean-prefix optimization in V1.0 — every dispatch re-fires the full chain from the root** (per implementation-time rubber-duck finding; see [03 §4.1 V1.0 implementation reality](03_runner.md#41-the-resolved-root-to-leaf-path--prepended-fresh_suffix)). The design's clean-prefix-into-`prepended_conversation` optimization (load prior assistant pieces as historical context, dispatch only the stale suffix) requires a per-wave piece cache to fetch the stored assistant pieces by `piece_id`. The cache is **not implemented in V1.0**; without it, the partition resolver has no honest way to populate `prepended_conversation` with prior assistant content. 
V1.0 ships the dumb-but-correct alternative: **every Send on a leaf's root-to-leaf path enters `freshSuffix`** and re-fires against the target. Cost: editing a single leaf at the bottom of a 10-deep clean chain now costs 11 calls instead of 2 (~5× hot-path regression for the "edit-leaf-only" workflow). Correctness: every dispatch sees the target's actual responses, not fabricated context. **V1.x ships the piece cache** (per the 03 §3.3a `_load_piece_as_request` cache spec); operators get the optimization back at that point. Operator-visible: refreshes take longer and cost more tokens in V1.0 than the doc's theoretical model suggests; the wave-summary's call count reflects the actual dispatch cost. +- **200-turn ceiling per root-to-leaf path** ([§9.4.1, runner §4.2](03_runner.md#42-the-200-message-cap)). `CreateAttackRequest.prepended_conversation` is capped at 200 messages by the backend ([attacks.py model](../../../pyrit/backend/models/attacks.py)). The cap is **per-root-to-leaf path** under AR-per-leaf — a tree with 1000 leaves at 10 turns deep is fine; only a single conversation chain whose clean prefix exceeds 200 turns trips the cap. **V1.0 surfaces a soft warning at 180 turns** in the canvas-level ribbon ([02 §2.3](02_tree_ui_affordances.md#23-canvas-level-affordances)): *"This conversation is approaching the 200-turn ceiling. Use Branch from a midpoint to keep extending."* Operators who do hit 200 see `failed` state on the leaf with a tooltip pointing at `branchToNewTree` (V1.0) as the recovery path. **This IS a new limitation introduced by AR-per-leaf-via-prepended_conversation** — today's chat tab uses `add_message` incrementally, which has no per-conversation cap. Operators rebasing a chain past 200 turns under the tree-UI runner hit a ceiling they don't hit in the chat tab. 
The trade-off was deliberate: AR-per-leaf simplifies the runner and the History view, and the 200-turn limit affects only the depth-of-single-conversation use case (Crescendo and similar multi-turn attacks); for those, the `branchFromNode` midpoint workflow is acceptable recovery. *V1.1 may revisit* by adding an `add_message`-only chain-extension path for "extend a clean leaf by one turn" (per [03 §8.2](03_runner.md#82-why-every-leaf-uses-create_attack--n-add_messages-not-one-or-the-other-alone) V1.1 follow-up), which would bypass the cap because add_message has none. **Combined with the no-clean-prefix-optimization above:** V1.0's `prepended_conversation` is effectively never the operator's clean history; it carries at most a system message. The 200-turn cap is therefore unreachable in V1.0 traffic under normal use — `prepended` length is 0 or 1. +- **Edits-since-last-Refresh lost on reload OR tree-swap.** §9.4.1's reload-reconstruction replays backend leaves; nodes added/edited but never refreshed have no backend AR and don't come back. Mitigations: §9.4.2 `beforeunload` guard catches reload; §13.1a in-app dirty-edit modal catches `openTree`/`closeTree`/`newTree`. (`branchToNewTree` is exempt per [§13.1](#131-v10-minimal-workspace) — the clone deep-copies the source's `edited` state, so nothing is lost in-session.) Operators see one of two modals before losing work. +- **One foregrounded tree at a time in V1.0.** Side-by-side comparison requires two browser tabs (mediated by the §9.4.3 advisory lock). The full tab strip is V1.1 (§1.1). +- **Pre-V1.0 ARs lose fan-axis intent on V1.1 reconstruction.** V1.0+ trees DO round-trip the fan axis via the `tree_path` label ([03 §4.3](03_runner.md#43-label-writes-the-round-trip-fidelity-contract)) — the JSON-encoded `[[axis, slot], ...]` array preserves each fan ancestor's axis exactly. 
**Pre-V1.0 ARs** (existed before tree-UI shipped) have no `tree_path` label; V1.1 fallback fanout-detection synthesizes `axis='prompt'` for all reconstructed fans (per [§9.3.1 `detect_fans_pre_v10`](#931-fan-grouping-algorithm-v11--original_prompt_id-chain-flattening--wave_id-disambiguator)). Acceptable: V1.0+ trees round-trip cleanly; older ARs reconstruct with the one-axis-fits-all heuristic. +- **ScoreNode is render-only in V1.0** ([§4.5](#45-observational-nodes-no-side-effect-on-the-conversation)). It displays `MessagePiece.scores` already attached to upstream pieces (e.g., from a Scenario-orchestrated import) but cannot author new scores. The `✏ Configure scorer + params` action rail icon is a disabled stub per [02 §2.2](02_tree_ui_affordances.md#22-per-node-action-rail) — V1.0 operators who want to score a leaf whose upstream has no scores must wait for V1.1's `runScorer(node_id)` operation. `📊 View score distribution` stays enabled (pure read-side aggregation). +- **sessionStorage wipe on schema-version mismatch.** A V1.0 → V1.1 upgrade that changes any persisted sessionStorage shape wipes all `pyrit.*` keys on boot per [§13.1 Schema versioning](#131-v10-minimal-workspace). Operator-visible effect: one toast (*"Saved settings were from a different version and have been reset."*), MRU empty, settings revert to defaults. Trees themselves are not affected — they reconstruct from backend leaves via §9.4.1. The only loss is a pre-V1.0 AR session opened via `openTreeFromAttackResult` but never refreshed (sessionStorage held the `parentSourceConversationId` link; wipe loses it; operator re-opens from History to recover). **Origin-shared sessionStorage collision risk:** if another app at the same browser origin uses `pyrit.*` keys for unrelated purposes, the schema-version-mismatch wipe is a collateral cost; bounded for the internal-tool PyRIT deployment context but worth naming for future shared-origin hosting scenarios. 
+- **Undo is in-memory and per-tree, capped at 20 entries.** Ctrl-Z within a tree undoes the last 20 structural edits ([§6.9](#69-node-editor-undo-v10)); tree-swap clears the stack and reload loses it. No redo in V1.0 (Ctrl-Shift-Z lands V1.x). No undo for refresh waves themselves — backend `AttackResult`s are append-only; operators recover via reflog `makeCurrent` (§6.7) instead. +- **No tree export / import primitive in V1.0** (per rubber-duck Finding C.6). Sharing a tree definition with a teammate is *only* via the source AR id + the recipient's `openTreeFromAttackResult` (auto-reverse path, §13.1) — which loses authoring state (unrefreshed nodes, `promotedChildSlotIndex`, `displayName`, undoStack). V1.x adds a JSON-export / import affordance scoped to the `ConversationTree` shape (no `ExecutionRecord` snapshot — the recipient re-fires Refresh against the source ARs they already have). Operators wanting reproducibility today should rely on the V1.0 auto-reverse path and accept the authoring-state loss. +- **i18n is V1.x; V1.0 makes one cheap commitment to keep migration tractable.** All operator-visible strings (toasts, modal copy, action-rail tooltips, action-row labels) live in a single registry at `frontend/src/strings/tree.en.ts` from day 1 — not scattered across 50 components. The registry is a flat `Record<string, string>` keyed by stable identifier; component code reads `t('wave.complete.toast')` rather than embedding the English string. V1.0 ships English-only; V1.x adds a sibling `tree.<locale>.ts` file and a locale-resolver. Without this commitment, V1.x i18n becomes a 2-week refactor instead of a translation-file PR. + +## 2. Vocabulary + +The single most important separation in this design: + +| Term | Meaning | Lifecycle | Persisted | +|---|---|---|---| +| **ConversationTree** | The tree the user is authoring: nodes + edges + parameters | Mutable, edited live | **V1: client-only React state; on reload the tree is reconstructed from backend labels (§9.4.1) and edits that were never refreshed are lost (§1.2).** V2: server-side resource (§11). 
| +| **Execution** | A record of what was actually sent and what came back | Append-only | Existing backend: `AttackResult` + `MessagePiece` | +| **Tree label** | A label written on every `AttackResult` produced from the same conversation tree | Set on create, immutable | `AttackResult.labels["conversation_tree_id"]` — enables grouping leaves in the history view | +| **Lineage link** | *(V2 only)* Pointer from `MessagePiece` back to a conversation tree node | Set on write | `MessagePiece.prompt_metadata["conversation_tree_node_id"]` — V1 omits this (see §7.3) | + +A tree node may have **zero or many** executions over its lifetime. Re-running a node creates a new execution; old ones move into `executionHistory` (capped — see §6). + +Additional terms used throughout: + +- **ConversationTreeNode** — a single vertex in the conversation tree (typed; see §4). +- **ConversationTreeEdge** — a directed dependency: `parent → child` means "child's input includes parent's output". Edges are not arbitrary — the tree shape is constrained (see §5). +- **Draft / Clean / Dirty / Stale / Running / Failed** — node states (see §6). +- **Branch from node** — given any node X in a ConversationTree, produce a fresh ConversationTree containing the root-to-X path plus X's descendants (no siblings of path nodes). New nodes share execution refs with the source until edited. UI labels: **"Clone tree"** when X is the root, **"Branch from here"** otherwise. See §6.5. +- **Fan-out** — a `FanNode` (§4.4): one input, N children, each child differs in exactly one parameter (the *axis*). +- **Leaf Send** — a `SendNode` with no `SendNode` descendant. Under the V1 materialization rule (§7.2), each leaf path of the conversation tree maps to **exactly one `AttackResult`** (matches today's `handleBranchAttack` semantics). 
+- **Side-effect class** — the five runner branches that node kinds factor into: *Source* (no input), *Transform* (pure 1→1), *Side-effecting* (calls the target), *Structural* (changes shape only), *Observational* (reads, never writes the conversation). §4 is organized along this spine. + +## 3. Conceptual Model + +```mermaid +flowchart LR + subgraph ConversationTree["ConversationTree (mutable, in the GUI)"] + P1[RootPrompt] + P2["FanNode(axis=converter)"] + P3a[UserTurn variant A] + P3b[UserTurn variant B] + P4a[Send] + P4b[Send] + P1 --> P2 --> P3a --> P4a + P2 --> P3b --> P4b + end + subgraph Exec["Execution (append-only, backend)"] + E1[(MessagePiece a)] + E2[(MessagePiece b)] + AR1[("AttackResult #1 / conversation_tree_id=T")] + AR2[("AttackResult #2 / conversation_tree_id=T")] + end + P4a -. "executes as" .-> E1 + P4b -. "executes as" .-> E2 + E1 --> AR1 + E2 --> AR2 +``` + +The conversation tree is the **recipe**. The execution is the **record**. The tree view is the visual representation of the conversation tree; the linear chat ([MessageList.tsx](../../../frontend/src/components/Chat/MessageList.tsx)) becomes one *projection* of the conversation tree along a chosen root-to-leaf path. **Each leaf `Send` produces its own `AttackResult`**; all `AttackResult`s from one conversation tree share `labels.conversation_tree_id` so the history view can group them (see §7). + +### 3.1 Why a separate conversation tree layer? + +Three forces push us here: + +1. **Edits must not destroy history.** PyRIT's storage is append-only (every duplication preserves `original_prompt_id`; see [`_duplicate_conversation_up_to`](../../../pyrit/backend/services/attack_service.py#L824-L870)). A "live edit" cannot mutate a `MessagePiece` in place. So the editable surface must live elsewhere — that's the conversation tree. +2. **Fan-out is a recipe, not a record.** "Run 5 attempts" is a single user intent. The 5 resulting conversations are 5 records. 
Modeling them as one conversation tree node with 5 child executions matches user intent and lets us redo / partially re-run cleanly. +3. **Today's UI conflates the two.** The "Branch into new attack" button ([ChatWindow.tsx#L456-L475](../../../frontend/src/components/Chat/ChatWindow.tsx#L456-L475)) is a one-shot deep-copy; the user has no handle on the relationship between the source and the branch other than `original_prompt_id`. The conversation tree layer is exactly that handle. + +### 3.2 Alternatives considered and rejected + +The ConversationTree/Execution split is a choice, not the only option. The three alternatives a principal-engineer review will ask about: + +| Alternative | Idea | Why we reject for V1 | +|---|---|---| +| **Render-only over backend lineage** | No conversation tree layer; project a tree directly from existing `AttackResult.related_conversations` + `MessagePiece.original_prompt_id` | Fan-out has no backend representation. `original_prompt_id` says "this piece was copied from that piece"; it cannot say "these N siblings are one fan-out intent." Render-only would either need a backend schema change (defeating "no new endpoints") or would silently lose the user's intent on reload. | +| **Pure event log + projection** (event sourcing) | ConversationTree as an append-only log of `addNode`/`editNode`/`refresh` events; current state is a projection | Buys real multi-tab and undo/redo. Costs an order of magnitude more design effort and obscures the otherwise-obvious mapping in §7. Right to defer; wrong to never name. Revisit if multi-tab becomes a P0. | +| **CRDT-style versioned node graph** | Per-node version vectors; merge on conflict | Solves multi-tab. Consumes the entire complexity budget. Not justified by single-operator use. 
| +| **No conversation tree layer; backend orchestrator** | Push fan-out into PyRIT executors (e.g., a new `FanOutAttack`) and treat the UI as a thin shell | Would make scenarios the source of truth for tree shape - reasonable long-term, but requires designing the orchestrator first. Backwards-compatible to layer on after V1 ships. | + +We pick ConversationTree/Execution because (a) it makes fan-out expressible without backend changes, (b) the mapping to existing endpoints is mechanical (§7), and (c) it is the smallest layer that captures the user's stated intent (edit upstream, propagate down opt-in). The §11 spike will decide whether the *conversation tree itself* lives client- or server-side. + +## 4. Node Taxonomy + +Six kinds, organized by **side-effect class** (the spine that drives runner branches, test surface, and editor design). The five families in the previous revision are gone — they were documentation, not abstraction. Each side-effect class corresponds to exactly one branch in the runner. + +```ts +// /frontend/src/components/Tree/types.ts (proposed) + +export type ConversationTreeNodeId = string // UUID v4, stable across edits + +export interface ConversationTreeNodeBase { + id: ConversationTreeNodeId + kind: ConversationTreeNodeKind + parentId: ConversationTreeNodeId | null // null = root + /** + * SHA-256 of the resolved input bundle (see §5). Cached; recomputed whenever + * this node or any ancestor is edited. Crucially, for children of a FanNode + * the hash MUST include the edge's `slotIndex` so siblings have distinct + * hashes even when their parent's resolved input is identical. + */ + resolvedInputHash: string + state: NodeState // see §6 + execution: ExecutionRecord | null // most recent; older ones in executionHistory + executionHistory: ReflogEntry[] // capped, see §6; each entry wraps an immutable ExecutionRecord with per-tree state (pinned flag, etc.) 
+ /** + * Operator-readable error reason populated when the node transitions to `failed` + * or `cancelled` (or to `stale` via the §5.3 in-flight cascade). Cleared when the + * node transitions back to `running` (on retry) or to `clean` (on successful + * re-dispatch). Set by `RunnerStateSink.setNodeState` via its `opts.reason` + * argument (which accepts either a plain string for non-API-error cases or an + * `ApiErrorReason` struct for API-error paths per [03 §3.3a](03_runner.md#33a-helpers-referenced-by-the-dispatch-step)). + * Visible in the right-side drawer's `Current` tab and as the tooltip on the + * node's ⚠ chip ([02 §5.14](02_tree_ui_affordances.md#514-partial-failure-mid-refresh)). + * + * `failure_class` discriminates the four operator-meaningful failure modes; the + * wave-summary buckets per-leaf failure counts by this field per [03 §6 WaveEvent](03_runner.md#6-wave-bookkeeping). + * `'blocked'` is runner-synthesized when this node was dropped from `ready` by the + * [03 §5.3](03_runner.md#53-cascade-on-failure) in-flight cascade — distinguishable + * from the originating Send's actual failure_class (which surfaces on the + * originator's own `lastError`, not on the blocked siblings'). + */ + lastError: { + message: string + failure_class: 'transient' | 'rate_limited' | 'permanent' | 'blocked' + } | null + labels: Record<string, string> // operator, operation, plus user-defined + // V1.1: the `addedToStack` field (true iff this node was created as part of a + // Stack-`+` operation) is added in V1.1 (see [02 §6.1](02_tree_ui_affordances.md#61-addedtostack-on-conversationtreenodebase-v11)). + // Per Patch #7 (revision 9), V1.0 omits the field entirely. TypeScript is + // structural; V1.1 adds it as a non-breaking type extension with `false` + // default for any node created under V1.0 code paths (correct semantics: + // V1.0 had no Stack-`+` so nothing was operator-stacked). 
+ createdAt: string + updatedAt: string + /** + * Monotonic counter bumped on every `editParams` / `regenerateFanChildren` / + * `makeCurrent` mutation. **V1.0** reads this only for telemetry / debug logs. + * **V2** uses it as the last-write-wins key for the server-side collaborative-tree + * concurrency model ([§13.8](#138-multi-operator-collaboration-v2)). Carrying it in V1.0 + * costs nothing at the data-model layer and makes V2 a non-migration: V2 reads + * `version` directly off V1.0-authored nodes loaded from sessionStorage with no + * defaulting needed (default 1 for newly-minted nodes; the V1.0 mutators that + * already bump `updatedAt` also bump `version`). + */ + version: number +} + +export type ConversationTreeNodeKind = + | 'root_prompt' // §4.1 — Source + | 'import_message' // §4.1 — Source + | 'user_turn' // §4.2 — Transform (also covers manual override via role) + | 'send' // §4.3 — Side-effecting + | 'fan' // §4.4 — Structural + | 'score' // §4.5 — Observational +``` + +| Side-effect class | Kinds | Runner behaviour | +|---|---|---| +| **Source** | `root_prompt`, `import_message` | Produce an initial bundle; no API call for `root_prompt`, single `POST /attacks` for `import_message` | +| **Transform** | `user_turn` | Pure 1→1; no API call by itself — it appends to the upstream bundle. 
The `Send` child of a `UserTurn` is what hits the wire | +| **Side-effecting** | `send` | One `POST /attacks/{id}/messages` per refresh; the only node that mutates external state | +| **Structural** | `fan` | No API call; manages child set, slot assignment, and slotIndex hashing | +| **Observational** | `score` | Reads `MessagePiece.scores` from existing pieces; in V2 may issue scorer requests | + +### 4.1 Source nodes (no input) + +```ts +export interface RootPromptNode extends ConversationTreeNodeBase { + kind: 'root_prompt' + params: { + text: string + attachments: PieceSpec[] // text/image/audio/video/binary + systemPrompt?: string + targetRegistryName: string // default target for downstream Send nodes + } +} + +export interface ImportMessageNode extends ConversationTreeNodeBase { + kind: 'import_message' + params: { + sourceConversationId: string // existing conv to seed from + cutoffIndex: number // see CreateAttackRequest.cutoff_index + /** + * NOTE: V1 does NOT verify that the caller has permission to read the + * source. The backend's `create_attack_async` will happily duplicate any + * conv by ID (see attack_service.py:L302-L316). Operator isolation today + * is enforced only on `add_message` via `_validate_operator_match` + * (attack_service.py:L682). Tightening import-time auth is tracked in §9. + */ + } +} +``` + +**Target inheritance from imported context (V1.0).** When a Send descendant of an `ImportMessageNode` dispatches, the runner inherits the target from the import-source AR (resolved via `GET /attacks?conversation_id=sourceConversationId` at import time, cached on the node). The operator does NOT pick a target at Send-creation time in V1.0 — that's the `🎯 Change target` affordance on `SendNode` (V1.1 only, per [02 §2.2](02_tree_ui_affordances.md#22-per-node-action-rail)) and the `Fan(axis='target')` axis (V1.1). 
For V1.0 trees that extend an imported chain, the inherited target is presented in the SendNode card as `target: gpt-4o (inherited from import)` for visual confirmation; operators who want to change the target must wait for V1.1 OR clone the tree (`branchToNewTree` ships V1.0) to a fresh root and pick a target there. + +`ImportMessageNode` is how the tree view picks up where the linear chat left off. The migration of existing linear attacks into the tree view is detailed in §9.3. + +### 4.2 Transform nodes (1 in → 1 out, pure) + +A single kind, with `role` as a discriminator. The previous `EditNode` collapses into this one — the backend already supports `role='simulated_assistant'` ([attack_service.py#L314](../../../pyrit/backend/services/attack_service.py#L314)) for inert/injected context, so a dedicated kind was redundant. + +```ts +export interface UserTurnNode extends ConversationTreeNodeBase { + kind: 'user_turn' + params: { + /** + * Default role is 'user' (a normal turn). Set to 'simulated_assistant' to + * inject a fake assistant turn (the backend marks these inert so the target + * does not reinterpret them). Set to 'system' for a system message. + * The plain string 'assistant' is intentionally not in this union — real + * assistant turns only come from a Send node, never from the operator. + */ + role: 'user' | 'simulated_assistant' | 'system' + text: string + attachments: PieceSpec[] + /** Sequential converter pipeline (matches AddMessageRequest.converter_ids). */ + converterPipeline?: ConverterRef[] + } +} +``` + +`converterPipeline` is the **sequential pipeline** the backend already supports ([converter_service.py#L605-L650](../../../pyrit/backend/services/converter_service.py#L605-L650)) — value flows through each converter in order. When the user wants cartesian/sweep instead, they place the upstream in a `FanNode(axis='converter')` (§4.4). 
The two semantics are independently composable: a `UserTurn` may chain `[Base64, Compress]` as a pipeline, and a `Fan(axis='converter', variants=[ROT13, AsciiArt])` upstream of it would produce two `UserTurn` branches, each running its child pipeline. + +### 4.3 Side-effecting nodes + +```ts +export interface SendNode extends ConversationTreeNodeBase { + kind: 'send' + params: { + /** May override the target inherited from the upstream RootPromptNode. */ + targetRegistryName?: string + /** Optional send-time converters; merged after the upstream UserTurn's pipeline. */ + converterPipeline?: ConverterRef[] + } +} +``` + +A `SendNode` is the **only** node that mutates external state (one `POST /attacks/{id}/messages`, [routes/attacks.py#L440-L478](../../../pyrit/backend/routes/attacks.py#L440-L478)). Its `execution` field records the assistant response. Refreshing it is the only operation that incurs token cost. + +### 4.4 Structural nodes — the uniform FanNode shape (per-axis dispatch) + +The previous revision had four `*Fan` kinds (`AttemptFan`, `ConverterFan`, `PromptFan`, `TargetFan`). They differed only in *which dimension is varied per child*. Collapsed to one node with a typed axis. + +> **Honest framing (rev 18, per rubber-duck Finding D.2).** The FanNode *type* is uniform; the *behavior* across axes is a polymorphic dispatch table. 
"Adding a new axis is a registration" (§1 goal #2) is aspirational — the actual work is a 4-tuple per axis: (a) extend the `FanVariant` discriminated union with the new payload shape; (b) add a resolver case in [03 §3.3a](03_runner.md#33a-helpers-referenced-by-the-dispatch-step) that maps the payload into per-piece `MessagePieceRequest` overrides and/or per-attack request fields; (c) decide the persistence story — some axes (e.g., `temperature`) are not recoverable from current backend state and need a new label round-tripped per [03 §4.3](03_runner.md#43-label-writes-the-round-trip-fidelity-contract); (d) add a reconstruction case in [§9.3.1 variant-payload reconstruction](#931-fan-grouping-algorithms). Use this checklist when adding `prompt` / `target` / `system_prompt` / `temperature` in V1.1+. The uniform shape is what makes the dispatch table *small* and *centralized* (one resolver, one reconstruction file); without that uniformity the runner would carry four per-axis code paths instead of one parametric one. + +> **Version scope.** The `FanAxis` type below enumerates the full design surface. **V1.0 ships `attempt` and `converter` axes only.** `prompt`, `target`, `system_prompt`, and `temperature` are scoped for V1.1+. The runner branches and DTO mappings differ per axis; V1.0's two-axis surface is enough to exercise every runner primitive (single-target re-execution, converter-pipeline mutation, AR-per-leaf materialization). V1.1 adds the remaining axes without changing the type. +> +> Operator-visible consequence in V1.0: the `🔀 Fan out` submenu in [02 §2.1](02_tree_ui_affordances.md#21-per-edge-insert-on-edge-) shows `attempt` and `converter` enabled; the others render as disabled menu items with a "V1.1" badge so operators learn the surface area. 
+ +```ts +export type FanAxis = + | 'attempt' // V1.0 — identical inputs; N independent re-runs + | 'converter' // V1.0 — each variant appends a converter pipeline + | 'prompt' // V1.1 — each variant overrides upstream text/attachments + | 'target' // V1.1 — each variant changes the target (spawns new AttackResult) + | 'system_prompt' // V1.1 — each variant overrides the upstream system prompt + | 'temperature' // V1.1+ — each variant tweaks target params + // ...extensible by registration, not by code change + +export interface FanNode extends ConversationTreeNodeBase { + kind: 'fan' + params: { + axis: FanAxis + /** + * For axis='attempt', variants is an array of N empty objects (only count matters). + * For other axes, each variant carries the per-child override payload. + */ + variants: FanVariant[] + /** + * For multi-value axes (e.g. converter), how to combine multiple variants. + * 'each' : len(variants) children (default; current scope) + * 'cross' : v2 — Cartesian product when a single axis carries multiple sub-values. + * EXPLICITLY out of V1 scope to avoid the cardinality ambiguity the + * previous revision left undefined. Nested fan-out via parent/child + * composition is the V1 way to express products. + */ + mode?: 'each' + /** + * Optional: the slotIndex of one child to mark as "promoted". UI renders + * the promoted child at full opacity with a highlight border; other children + * are dimmed ("frozen") and do not receive stack-edits or new synced + * children. Set by the "Pick one" UI affordance (02_tree_ui_affordances.md + * §3.3); cleared by "Unpick". The cherry-pick analogue from the git mental + * model in §6.8. Null = all children synced (default). + * + * Promotion is purely a UI/editing concern; runner ignores this field and + * always refreshes every stale descendant. Operators who want "only refresh + * the promoted path" use a per-call option, not this field. 
+ */ + promotedChildSlotIndex: number | null + /** + * Slot indices that have been deleted from this fan. The §5.1 invariant + * "slot stability" says deleted children's slotIndices become tombstones + * (siblings do not renumber). Recording the tombstones explicitly here + * makes the invariant runtime-checkable: the next slot allocated to a new + * variant is `max(variants[].slotIndex ∪ deletedSlotIndices) + 1`, never + * a recycled index. Empty for fresh fans. + */ + deletedSlotIndices: number[] + } +} + +export type FanVariant = + | { axis: 'attempt'; payload: Record } + | { axis: 'prompt'; payload: { text: string; attachments?: PieceSpec[] } } + | { axis: 'converter'; payload: { converters: ConverterRef[] } } + | { axis: 'target'; payload: { targetRegistryName: string } } + | { axis: 'system_prompt'; payload: { systemPrompt: string } } + | { axis: 'temperature'; payload: { temperature: number } } +``` + +**Cartesian products compose by nesting.** *"3 prompts × 5 converters × 4 attempts"* is: + +``` +RootPrompt +└─ Fan(axis='prompt', variants=p1,p2,p3) + └─ (per child) UserTurn + └─ Fan(axis='converter', variants=c1..c5) + └─ (per child) UserTurn + └─ Fan(axis='attempt', variants=[{},{},{},{}]) + └─ (per child) Send +``` + +60 leaf `Send` nodes, each independently re-runnable. See Appendix A for the full materialization. + +**Implementation note on child generation.** Fan children are *materialized* in the conversation tree (each child is a real `ConversationTreeNode` with its own `id` and editable params). This matters because: + +- Per-child state (clean / edited / stale / failed) lives on each leaf. +- The user can edit one child (e.g., tweak the text on attempt #3) without affecting siblings. +- Re-running the parent does not regenerate children unless the user explicitly requests "regenerate children" (which is a destructive op that resets per-child edits). +- `slotIndex` is the stable identity of a child within its parent. 
Deleting a child tombstones the slot — sibling slot indices do not shift. + +### 4.5 Observational nodes (no side effect on the conversation) + +```ts +export interface ScoreNode extends ConversationTreeNodeBase { + kind: 'score' + params: { + scorerType: string + scorerParams?: Record + } + // execution.result holds the score; no MessagePiece is added to the conversation. +} +``` + +`ScoreNode` attaches scoring (truthfulness, harm category, etc.) at any point in the tree. + +**V1.0 scope: read-only display of pre-existing scores.** V1.0 ships `ScoreNode` as a **display surface only** — it reads scores already attached to the upstream `MessagePiece.scores` ([models/attacks.py#L20-L31](../../../pyrit/backend/models/attacks.py#L20-L31)) and renders them in the node card; **the runner does not issue scorer requests** (per [§12.4](#124-no-auto-scoring-on-send---decided-v10)). This means dragging a `ScoreNode` onto a leaf whose ancestor pieces have no scores produces a node that renders as `(no scores)` — visually present but inert. Operators see scores from imported attacks (e.g., a Scenario-orchestrated run with default scorers; [scenario.py:L375-L410](../../../pyrit/scenario/core/scenario.py#L375-L410)) but cannot create scores from inside the tree view in V1.0. **The `✏ Configure scorer + params` action rail icon renders as a disabled stub** per [02 §2.2](02_tree_ui_affordances.md#22-per-node-action-rail) (slot reservation against UX regression) — V1.0 cannot honor a configured scorer because the runner never invokes one. `📊 View score distribution` stays enabled in V1.0 as a pure read-side aggregation over upstream scores. + +**Runner state for V1.0 ScoreNodes:** treated as `clean` after the [03 §3.3a `reconcileTransformStates`](03_runner.md#33a-helpers-referenced-by-the-dispatch-step) walk; never enters the `ready` queue (no dispatch). Score values are read at render time from the upstream `MessagePiece.scores` already loaded in the tree's React state. 
+ +**V1.1+:** add an explicit `runScorer(node_id)` operation that POSTs to a `/api/scores` endpoint (does not exist yet; tracked as backend ask) and writes the result into `execution.result`. At that point `ScoreNode` joins the dispatch surface as its own side-effect class. + +### 4.6 Shared types + +```ts +export interface ExecutionRecord { + /** UUID v4 generated by the runner. Replaces the prior timestamp-based ID + * to avoid collisions when multiple sends fire in the same ms. */ + executionId: string + attemptedAt: string + attackResultId: string | null // which AttackResult this execution belongs to + conversationId: string | null // which conversation in that AttackResult + pieceIds: string[] // MessagePiece IDs produced by this execution + outcome: 'success' | 'failure' | 'error' | 'cancelled' | 'pending' + errorMessage?: string + /** For replay / debugging — the hash that was current when this execution started. */ + resolvedInputHashAtExecution: string + /** + * **Per-leaf timing fields (rev 18, per rubber-duck Finding C.1).** All three are + * ISO-8601 UTC strings; all three are nullable to cover failures that never reached + * the target. The runner writes these inline with state transitions — `dispatchedAt` + * at the `running` transition, `targetFirstByteAt` when the first response chunk + * arrives (or on `add_message`'s response for non-streaming targets), `completedAt` + * at the terminal `clean` / `failed` / `cancelled` transition. Implementers MUST + * populate all three on successful dispatches; UI surfaces (the [02 §8.2 Recent waves + * drawer](../../../doc/gui/design/02_tree_ui_affordances.md#82-the-v1-drawer-a-recent-waves-tab)) + * compute `target_latency_ms = completedAt - dispatchedAt` for per-leaf rows. This + * is what makes the [03 §11.1](03_runner.md#111-unit-testable-in-isolation-no-backend) + * `inflight.size <= maxParallel` invariant validatable in production rather than + * only in unit tests. 
+ */ + dispatchedAt: string | null + targetFirstByteAt: string | null + completedAt: string | null +} + +/** + * Per-tree wrapper around an ExecutionRecord. The `execution` itself is immutable + * and may be SHARED across cloned trees (per §6.5 sharing semantics); the wrapper + * carries per-tree state such as the `pinned` flag (per §6.6 `pinExecution`). Each + * tree's `executionHistory` is a shallow-copied array of these wrappers, so a pin + * or eviction in tree A does not affect tree B's view of the same shared + * ExecutionRecord. The runner only reads `entry.execution`; the wrapper fields are + * pure tree-side state and never sent to the backend. + */ +export interface ReflogEntry { + execution: ExecutionRecord // immutable; shareable across trees + pinned: boolean // per-tree; default false; survives reflog eviction when true +} + +export interface ConverterRef { + // Either a stored converter instance (preferred — matches converter_id in the backend) + converterId?: string + // Or an inline spec (for ephemeral converters added in the tree view) + inline?: { + type: string // ConverterType class name + params: Record + } +} + +export type PromptDataType = 'text' | 'image_path' | 'audio_path' | 'video_path' | 'binary_path' + +export interface PieceSpec { + dataType: PromptDataType + value: string // text or base64 or path + mimeType?: string + originalPromptId?: string // matches MessagePieceRequest.original_prompt_id +} + +/** + * Failure-class discriminator carried on every `lastError` per [§6.1](#61-states). + * - 'transient' : 5xx, network, timeout. [Retry failed] retries. + * - 'rate_limited' : HTTP 429 or provider-specific overloaded shapes (Anthropic + * overloaded_error, OpenAI rate_limit_exceeded, etc.). [Retry failed] + * excludes these from the retry set; operator waits + Refresh tree. + * - 'permanent' : 4xx other than 429 (validation, operator-lock mismatch, + * target-not-found). 
[Retry failed] excludes these too — operator + * must fix the cause and re-trigger. + * - 'blocked' : runner-synthesized when this node was dropped from `ready` by the + * [03 §5.3](03_runner.md#53-cascade-on-failure) + * in-flight cascade. Node state is `stale` (not `failed`); see [§6.1](#61-states). + */ +export type NodeFailureClass = 'transient' | 'rate_limited' | 'permanent' | 'blocked' + +/** + * Structured error reason returned by `_format_api_error` ([03 §3.3a](03_runner.md#33a-helpers-referenced-by-the-dispatch-step)) + * and passed into `RunnerStateSink.setNodeState(opts.reason)`. The sink writes it + * directly into the node's `lastError` per [§6.1](#61-states). + */ +export interface ApiErrorReason { + message: string + failure_class: NodeFailureClass +} +``` + +## 5. Edge & Data-Flow Model + +```ts +export interface ConversationTreeEdge { + id: string + parentId: ConversationTreeNodeId + childId: ConversationTreeNodeId + /** + * For FanNode parents, identifies which variant this edge feeds. For + * non-fan parents, slotIndex is 0. + * + * INVARIANT: slotIndex MUST be incorporated into the child's + * `resolvedInputHash` (see below). Without this, all N children of an + * `attempt`-axis fan have identical hashes and per-child edited/stale + * tracking is broken. + */ + slotIndex: number +} +``` + +### 5.1 Invariants + +1. **Tree, not DAG.** Every node has exactly one `parentId` (the root has `null`). Fan nodes have N outgoing edges but each child has exactly one parent. (V2 may relax this for `best_of` aggregation.) +2. **Slot stability.** When a fan node's child is deleted, the `slotIndex` of remaining children does not change — the deleted slot becomes a tombstone. This keeps "attempt #3" identifiable across edits and across rehydration of a persisted conversation tree. +3. **Edges are derived, not authored.** Users add/remove nodes; the edge set follows from `parentId` + slot assignment. 
Cycles are impossible by construction. +4. **Hash uniqueness across fan siblings.** Two children of the same fan must hash differently iff at least one of `(slotIndex, variant payload)` differs. The `attempt` axis is the degenerate case: variant payload is empty, so `slotIndex` is the only discriminator. Bake this into the hash function. +5. **Leaf-input ancestor shape.** A `SendNode`'s **first non-Fan, non-Score ancestor on the root-to-leaf path** is always either a `UserTurnNode` with `role='user'` or a `RootPromptNode` (the very-first Send of a fresh tree, treating Root's text as the first user turn). The ancestor is the Send's *input* — the user-role turn whose content the Send fires at the target. `'simulated_assistant'` and `'system'` UserTurn roles are inert by construction ([§4.2](#42-transform-nodes-1-in--1-out-pure)) and never act as a Send's input. **Fan and Score ancestors are transparent** — they sit between a Send and its input UserTurn without changing what the input is. This is critical for fan-children: a `Fan(axis='attempt')` directly above a Send is the common case, and the Send's input UserTurn is the UserTurn ABOVE the Fan (shared across all attempt siblings, varied only by the slot's variant payload per [§4.4](#44-structural-nodes--the-uniform-fannode-shape-per-axis-dispatch)). The runner's resolver ([03 §4.1](03_runner.md#41-the-resolved-root-to-leaf-path--prepended-fresh_suffix)) walks through Fan/Score ancestors transparently to find the Send's input. Violations are runner bugs, not operator errors. + +### 5.2 Resolved input — specification + +Every non-source node has a *resolved input* — the byte-exact bundle that would be sent on the next downstream `Send`. 
It is a pure function of the parent's resolved input, this node's params, and (for fan children) the edge slotIndex/variant: + +``` +resolvedInput(node) = transform(node.kind, node.params, edge.slotIndex, edge.variant, resolvedInput(node.parent)) +``` + +The `transform` per kind: + +| Kind | Behaviour | +|---|---| +| `root_prompt` | Returns the seed bundle: `{ messages: [], systemPrompt, target, attachments }` | +| `import_message` | Returns the bundle hydrated from `GET /attacks/.../messages?conversation_id=…` clipped to `cutoffIndex` | +| `user_turn` | Returns parent bundle with an extra `Message` appended: `{ role: params.role, text, attachments, converterPipeline }` | +| `send` | **Identity transform** on input. Send does not change the bundle; it executes it. The output (the assistant response) is recorded in `execution`, not in `resolvedInput`. | +| `fan` (the parent node itself) | Identity on input. The fan does not transform the bundle — it spawns N children, each of which transforms based on its slot. | +| **Fan child edge** | Applies `variant.payload` per axis: `attempt` is identity (slotIndex differentiates), `prompt` replaces last `user` message, `converter` appends `payload.converters` to its UserTurn child's pipeline, `target` rewrites the target downstream, `system_prompt` overrides upstream system message, `temperature` mutates target params at the next Send | +| `score` | Identity on bundle; reads from existing pieces | + +### 5.3 Hash function + +```ts +resolvedInputHash(node) = sha256( + parentHash || ":" || slotIndex || ":" || serialize(node.kind) || ":" || serialize(node.params) || ":" || serialize(variantPayload) +) +// `||` is string concatenation. `serialize` is canonical-JSON (sorted keys, +// stable null/undefined handling) so equivalent params hash equal. +// `parentHash` is the empty string for the root. +``` + +Cached on each node. 
This is what powers the `stale` detection in §6: when a parent's hash changes, the child's recorded `executionRecord.resolvedInputHashAtExecution` no longer matches its current `resolvedInputHash`, so the node is `stale`. Including `slotIndex` ensures the N children of `Fan(axis='attempt', n=5)` all hash differently and can be independently dirtied / refreshed. + +**Invalidation strategy: lazy on read.** The hash is **not eagerly recomputed** during the §6.3 edit-propagation walk (which would force an O(descendants) recomputation on every keystroke during text editing). Instead, edit propagation flips descendants' `state` to `stale` and clears their cached `resolvedInputHash` to `null`; the next read (by the renderer for stale-detection, or by the runner at dispatch time) lazily recomputes via the §5.3 hash function. The cached value is restored as a side effect of the read. This matches React's idiomatic memo-on-read pattern and avoids work the operator doesn't see. + +**In-flight edit race resolution.** If the operator edits an upstream node while a wave is in-flight, the runner's `setNodeState(running → clean)` on the affected descendant and the React state container's `setState(clean → stale)` from the edit race. **No atomicity guarantee is needed:** stale-detection is computed at render time from `currentHash !== execution.resolvedInputHashAtExecution`, and `currentHash` recomputes lazily after the edit propagated. The final visible state is `stale` regardless of which write lands first — the edit's hash invalidation is the deciding signal, not the order of state-machine transitions. Implementers should NOT add ordering guards; the lazy-hash mechanism is the race resolution. + +**`regenerateFanChildren` (§4.4 destructive op) preserves slot stability.** New children replacing deleted ones get fresh slot indices from `max(variants[].slotIndex ∪ deletedSlotIndices) + 1` per the §4.4 tombstone invariant — never recycled. 
This means a regenerated child's `resolvedInputHash` includes a different `slotIndex` than the deleted child's, so reflog entries from the deleted child cannot match the regenerated child by hash (correct: they are different nodes, not stale executions of the same node). + +## 6. Node Lifecycle & Propagation + +### 6.1 States + +```ts +export type NodeState = + | 'draft' // newly added; never executed (operator-facing label: "new" — see below) + | 'clean' // execution.resolvedInputHashAtExecution === current resolvedInputHash + | 'edited' // node was edited since last execution; needs re-run (renamed from 'dirty' in rev 14) + | 'stale' // self unchanged, but an ancestor was edited; needs re-run + | 'running' // execution in flight + | 'failed' // last execution returned an error + | 'cancelled' // last execution was cancelled by the operator before completion +``` + +**Operator-facing label for `'draft'` is "new" (rev 15).** Internal field name stays `'draft'` for code-grep stability, but the UI chip + hover tooltip read **"new"** (or "new (never run)" on hover) to avoid the operator-side mis-parse "this is a draft message I'm composing." The state means *the node has been authored in the tree but has never produced an execution* — nothing about composition state. The 02 §5 state-suffix legend (`○ new (never run)`) and any V1.0 surface that renders the state pill follow this label. + +**Naming note (rev 14):** the `'edited'` state was previously `'dirty'`. Renamed because `dirty` and `stale` read as near-synonyms to operators unfamiliar with git/build-system conventions; `'edited'` is the operator's own word for "I changed this" and pairs unambiguously with `'stale'` ("ancestor changed"). The state-noun pattern is preserved. Internal feature names like "dirty-edit guard" (§13.1a) retain the older adjective — they predate the rename, and the name "dirty-edit" stays clearer than "edited-edit." 
+ +`cancelled` is distinct from `failed` because the operator-driven path back to `clean` is different: cancelled re-runs are expected and free of error metadata; failed re-runs should surface the prior error to the operator. + +### 6.2 Transitions + +```mermaid +stateDiagram-v2 + [*] --> draft: addNode() + draft --> running: refresh() + clean --> edited: editParams() + clean --> stale: ancestorEdited() + edited --> running: refresh() + stale --> running: refresh() + failed --> running: refresh() + cancelled --> running: refresh() + running --> clean: execution.outcome=success + running --> failed: execution.outcome=error + running --> cancelled: cancel() +``` + +### 6.3 Propagation rules + +These are the heart of the "opt-in propagation" the user asked for. The git mental model in §6.8 names this same machinery in operator-friendly terms: `refreshSubtree` is surfaced in the UI as **Refresh subtree** (conceptually a rebase), an edit makes downstream nodes need a refresh, and the operator opts in node-by-node or subtree-at-a-time. + +1. **Edits propagate immediately but inertly.** When `editParams(node)` runs: + - `node.state` ← `edited` + - For every strict descendant `d`: if `d.state ∈ {clean, cancelled, failed}` then `d.state ← stale` (and `d.execution ← null` for `failed` descendants per §6.4.1). The operator's refresh signals "give the subtree a clean slate," which covers failures whose root cause may have been the now-changed upstream. `running` descendants are ignored — they will recompute their hash on completion and re-evaluate. + - **No execution is triggered.** +2. **Refresh has three scopes, each precisely defined:** + - `refreshNode(id)` — re-execute *this single node only*, regardless of kind: + - `root_prompt` / `import_message`: re-hydrate the seed bundle (no API call for `root_prompt`). + - `user_turn` / `score`: recompute `resolvedInputHash`; no API call. Transitions to `clean` immediately if upstream is `clean`. 
+ - `send` (leaf): one dispatch sequence per [03 §3.3](03_runner.md#33-dispatch-step-leaf-sendnode--partition--create_attack--sequential-add_message-calls) — `create_attack` + N `add_message`s for the leaf's stale Sends (with N=1 if only the leaf itself is stale). + - `send` (interior, i.e. has a `send` descendant): **V1.0 treats this as a structural alias for `refreshSubtree(id)` restricted to descendant leaves.** Per [03 §3.2](03_runner.md#32-what-gets-dispatched), interior Sends never appear independently in the `ready` queue — every dispatch is anchored on a leaf. Operator semantics: "refresh this Send" means "regenerate this Send and everything downstream of it that depends on it"; the runner picks the descendant leaves and dispatches their full sequences (which re-fire this Send as part of each leaf's fresh suffix). The reason for the alias: a single `add_message` against the existing interior AR would re-fire only the target call, but the per-leaf ARs downstream still reference the OLD interior assistant pieces in their `prepended_conversation`; the leaves would render stale after a "single-Send refresh" succeeded. The alias guarantees consistency at the cost of re-firing the chain. V1.1 may optimize via `add_message`-against-existing-AR for the "extend a clean leaf by one turn" hot-path, but the single-Send-refresh case is not on the V1.1 cut surface — operators who want surgical regeneration use `branchFromNode` to scope. + - `fan`: **V1.0 aliases this to `refreshSubtree(id)`** for the same reason interior-Send refresh aliases to subtree-refresh (the rule above): a fan's direct children are typically `user_turn` nodes (the operator's per-variant prompt or attempt input), and "refreshing" a `user_turn` is a no-op state recompute that dispatches no target calls. 
Aliasing to `refreshSubtree(fan_id)` walks every Send descendant under the fan and dispatches them — which is what the operator means by *"Refresh all children"* on the [02 §2.2 fan action rail's `↻`](02_tree_ui_affordances.md#22-per-node-action-rail). Previously this case was *"no-op on the parent itself, plus `refreshChildren(id)` semantics"* which produced zero target calls when children were `user_turn`s; reviewer rev-16 caught the tooltip/behavior mismatch. *It does not regenerate the child set* (that is `regenerateFanChildren`, a separate destructive op). + + **Recursion termination on Sends (legacy, retained for reference).** The earlier `refreshChildren(id)` framing walked **only direct children** and bottomed out at leaf Sends. Under the rev-16 alias-to-subtree rule above, this is now redundant — `refreshSubtree(fan_id)` is the canonical implementation — but the property still holds: every traversal initiated by `refreshNode(fan_id)` terminates because fans cannot have fan children in V1.0 (fans expand into a layer of Send/user_turn nodes, never directly into another fan; see [§9.3](#93-migration-of-existing-linear-attacks---auto-reverse-to-a-tree)). + - `refreshSubtree(id)` — re-execute this node, then walk descendants in topological order; each transitions `edited/stale/failed/cancelled → running → clean/failed/cancelled`. + - `refreshTree()` — equivalent to `refreshSubtree(root)`. +3. **Idempotency.** Refreshing a `clean` node is a no-op (no API call, no state change). +4. **Concurrency budget.** `refreshSubtree` accepts an optional `maxParallel` (default 4). **Budget is per-Workspace, shared across all open conversation trees** (§12.2 / §13). The runner has a single dispatch queue per Workspace; when picking the next ready leaf, it uses fair-share scheduling — preferring whichever tree's active wave has the fewest in-flight calls — so a large refresh on tree A does not starve a small refresh on tree B. 
*Future:* per-target sub-budgets to match target-specific RPM limits surfaced in `TargetCapabilitiesInfo.max_requests_per_minute`; noted in §12.2 but not on the immediate roadmap. +5. **Failures isolate, but block descendants.** A failed node does not stop sibling branches. Its descendants remain `stale` (they cannot proceed without a parent result); they become refreshable as soon as the parent succeeds. The runner surfaces `{ succeeded, failed, blocked, cancelled }` counts at the end of a subtree refresh. + +### 6.4 Failure & partial-commit semantics + +Three failure modes need distinct handling: + +| Mode | Detection | Behaviour | +|---|---|---| +| **Per-node failure** (target returned an error, validation rejected the message) | `add_message` raises or returns `response_error != 'none'` | Node transitions to `failed`; sibling branches continue; descendants stay `stale`. **The runner nulls `node.execution`** so that retry (§6.4.1 below) treats the node as needing fresh dispatch. The error message is captured separately on `node.lastError` (operator-visible in the drawer); the previous execution is **not** appended to `executionHistory` because it never completed. Operator can `refreshNode` or `editParams` to retry. | +| **Mid-subtree cancellation** (operator clicks "Stop") | Runner checks `cancellationToken` between dispatches | In-flight `send`s complete (no abort token in the backend route today; in-flight nodes are committed when their HTTP call returns). Not-yet-dispatched nodes transition `running → cancelled` immediately (and likewise null `node.execution` if they were previously holding one). Already-completed nodes remain `clean`. | +| **Tab crash / reload mid-refresh** | On reload, runner scans for `running` nodes | The reload-reconstruction path (§9.4.1) re-runs auto-reverse from backend state, which only sees committed leaves; mid-flight wave state is lost. 
Already-completed leaves restore correctly because `recordExecution` writes happen on success only. V2 server-side conversation tree storage will demote orphan `running` nodes back to `edited` / `stale` by checking which `executionId`s persisted. | + +#### 6.4.1 Why `node.execution = null` on failure (not preserved) + +A failed dispatch never produced a coherent `ExecutionRecord` for the node. Holding the prior execution after failure would: + +1. **Corrupt retry context.** The runner's resolver ([03 §4.1](03_runner.md#41-the-resolved-root-to-leaf-path--prepended-fresh_suffix)) reads `node.execution` to decide whether a Send is in the clean prefix or fresh suffix. If a failed Send retained its prior-wave execution, the resolver would load the prior wave's stale assistant pieces into the new AR's `prepended_conversation`, making the target see fabricated context. +2. **Confuse the visual state.** Operators read `node.execution` for the "this Send has output" affordance. A failed Send presenting a non-null execution invites the operator to inspect it as if the latest attempt succeeded. + +**Trade-off accepted: the partial-AR pointer is lost for V1.0.** For mid-chain failures (§3.3 of 03), the AR exists on the backend with the prefix turns that did succeed; the operator can find it in History via `labels.conversation_tree_id` + `wave_id` (it shows as a partial row). What's lost is the runner's ability to fast-path a retry by skipping `create_attack` and the already-succeeded `add_message`s. V1.1 may add a per-Send `partialAttackResultId: string | null` field for that fast-path (see [03 §7 rule 5](03_runner.md#7-failure--partial-commit-semantics)); V1.0 retries always re-pay `create_attack`. 
+ +### 6.5 Branch from node - the immutable-history primitive + +> **Version scope (revision 9).** The **always-new-tree variant of `branchFromNode` ships in V1.0** alongside a minimal-Workspace data model (§13 V1.0 variant): single-tree visible, no tab strip; `branchFromNode` swaps the active tree to the new clone, with the source tree re-openable from History via auto-reverse (§9.3). The **sibling-subtree-in-same-canvas variant** stays V1.1 (V1.0 ships its disabled-stub `🌿` button per [02 §2.2](02_tree_ui_affordances.md#22-per-node-action-rail) — slot reservation against UX regression). The V1.0 cut surface ([01 §1 V1.0 explicit exclusions](#v10-explicit-exclusions-deferred-to-v11)) reflects this: cut #2 is reduced to "sibling-subtree variant only." +> +> *Why this revision flipped:* the previous revision deferred all of `branchFromNode` to V1.1, leaving V1.0 operators with no in-tree way to "preserve the original" — they had to context-switch to the chat tab's "Branch into new attack." For the most-common operator motion ("this prompt didn't work, let me edit and try again without losing what I have"), the context switch is wrong. The minimal-Workspace data model is ~30 LOC of React state plus a "Switch tree" button in the canvas-level ribbon ([02 §2.3](02_tree_ui_affordances.md#23-canvas-level-affordances)); the cost is well below the operator-UX win. + +The concept "branch from a node" is exposed as **two distinct API functions**, each shipping in its own version. Earlier revisions used a single `branchFromNode(nodeId)` with implicit landing-mode at the call site; revision 14 splits them per reviewer guidance so the call site is forced to be explicit about which behavior it wants (the two have different return types, different version-scope, and different downstream invariants). + +```ts +// V1.0 — always-new-tree variant. +// Returns the new ConversationTree's id; the new tree contains a deep copy of the +// root→nodeId path + nodeId's descendants. 
Siblings of any node on the root→nodeId +// path are NOT copied. All cloned nodes initially reference the same backend +// ExecutionRecords (no execution cost, no token cost). +// V1.0 landing: swaps the Workspace's currentTree to the new clone; source tree is +// re-openable from History via auto-reverse (§9.3). +// V1.1+ landing: opens as a new tab in the Workspace tab strip (source stays +// foregrounded if operator prefers). +function branchToNewTree(nodeId: ConversationTreeNodeId): ConversationTreeId + +// V1.1 — sibling-subtree-in-same-canvas variant. +// Returns the new subtree's root NODE id (not a tree id) — the cloned slice lands +// as a sibling within the SAME ConversationTree, sharing the source tree's id. +// The new subtree renders with a distinct edge style (dashed "branch" label) to +// disambiguate from fan edges that already express "multiple paths from one +// ancestor." See "Two landing modes" below. +function branchToSubtree(nodeId: ConversationTreeNodeId): ConversationTreeNodeId +``` + +Both share a private `_deepCopySubtree(rootNodeId)` helper that does the path-plus-descendants deep copy with fresh `ConversationTreeNodeId`s; the divergence is only in the landing step (which is exactly the version-scoped piece). The shared helper guarantees the two variants produce structurally identical clones modulo where they end up. + +**Why two functions instead of `branchFromNode(id, { landingMode })`:** the two operations differ in (a) return type — tree id vs. node id, (b) version-scope — V1.0 vs. V1.1, (c) downstream invariants — the new tree has its own `conversation_tree_id` and gets `parentConversationTreeId` set, while the new subtree shares the source tree's id and is part of the same render canvas. Hiding this behind a flag invites silent call-site bugs (operator clicks the V1.1 button in a V1.0 build and gets a tree swap "for free" because the flag defaulted). 
Two explicit functions force every consumer to choose, fail loudly on the wrong choice, and version-cleanly: V1.0 only exposes `branchToNewTree`, V1.1 adds `branchToSubtree` as a non-breaking extension. + +**"branchFromNode" as concept name** persists in operator-facing docs and the git mental model (§6.8: "branch from a node — git equivalent of `git branch `"); the function-name split is purely the API surface. + +Conceptually, given the tree: + +``` +R --- A + \- X --- B + \- C +``` + +`branchToNewTree(X)` (V1.0) produces a new ConversationTree: + +``` +R' --- X' --- B' + \- C' +``` + +Note that A is **not** carried over — only the root-to-X path plus X's descendants. R' and X' carry the same `kind` and deep-copied `params` as R and X but with fresh `ConversationTreeNodeId`s. Every cloned node's `execution` field initially points at the same backend `ExecutionRecord` (and the same `executionHistory` entries) as its source node. The clone's nodes start in `clean` state because their `resolvedInputHash` still matches their referenced execution's `resolvedInputHashAtExecution`. + +**Sharing semantics — what is shared vs. per-tree (revision 10).** The phrase "share execution refs" above is precise: `ExecutionRecord` *objects* are immutable and may be shared across cloned trees, but each clone gets its **own `executionHistory` array** (shallow copy of the array at clone time; the array elements are shared `ExecutionRecord` refs). This matters because: + +- **Reflog evictions are per-tree.** A `REFLOG_CAP_PER_NODE` eviction in tree A's node X removes the entry from A's `executionHistory` array; B's clone X' still holds the ref. The underlying `ExecutionRecord` object remains in memory as long as B references it. +- **`pinExecution` is per-tree.** Pinning in A does not pin in B; the pin flag lives on the per-tree `executionHistory` entry, not on the `ExecutionRecord` object. 
+- **`makeCurrent` is per-tree.** Promoting an entry in A swaps A's `execution` pointer; B's `execution` is untouched. +- **The `ExecutionRecord` itself is treated as immutable.** Once written by the runner, its fields (`attackResultId`, `pieceIds`, `resolvedInputHashAtExecution`, `waveId`, `attemptedAt`, etc.) never change. Any operation that "modifies" the execution actually allocates a fresh `ExecutionRecord` and updates the per-tree pointer. + +Implementation: when cloning, `clonedNode.execution = sourceNode.execution; clonedNode.executionHistory = [...sourceNode.executionHistory]` (shallow array copy). Sharing the array element refs is fine; sharing the array reference itself would couple the two trees' reflog state and is the bug to avoid. + +**No backend calls fire at branch time.** This is the git equivalent of `git branch new ` — cheap, refs only. Cost is one `ConversationTreeNode` allocation per node copied, plus the same number of edges. For a typical 30-node path-plus-descendants slice, ~60 small object allocations. + +**Divergence is purely operator-driven.** The clone's nodes stay clean until the operator edits one. That edit: +1. Marks the edited node `edited` (its `resolvedInputHash` changed). +2. Marks all descendants `stale` (their ancestor changed) per §6.3 rule 1. +3. The next refresh on the clone produces fresh `ExecutionRecord`s pointing at brand-new `AttackResult`s under the new tree's fresh `conversation_tree_id`. The original tree is **never touched.** + +**UI affordances (V1.0 ships `📋`; V1.1 adds `🌿` — specified in [02_tree_ui_affordances.md §2.2](02_tree_ui_affordances.md#22-per-node-action-rail)):** + +- Per-node `📋` icon. Tooltip: **"Branch from here"** on any non-root node; **"Clone tree"** on the root node (where `branchToNewTree(root)` is the degenerate case — the clone is structurally identical to the source). **Ships V1.0** (single-tree Workspace; clicking swaps the active tree to the clone). 
+- Per-node `🌿` icon for the sibling-subtree variant (see "Two landing modes" below). Tooltip: **"Branch as subtree (same canvas)"**. **V1.1** (V1.0 renders disabled stub per [02 §2.2](02_tree_ui_affordances.md#22-per-node-action-rail)). *Visually distinct from `📋`* (branch-glyph vs. clipboard-glyph) so operators don't mistake them when both render. +- Right-click context menu offers the same labels plus their git aliases. +- The canvas-level ribbon offers "Clone tree" + "Switch tree" entry points (V1.0); V1.1 adds the tab strip. + +**V1.0 landing semantics:** clicking `📋` opens the cloned tree as the Workspace's `currentTree`; the source tree drops from the canvas but is **re-openable from History** via "Open as tree" (auto-reverse from §9.3, filtered by the source's `conversation_tree_id`). The §9.4.1 reload-reconstruction path means a re-opened source tree comes back with all completed leaves intact; only edits-since-last-Refresh from the original session are lost. The §13.1 V1.0 Workspace section names the affordances. + +**Two landing modes** (V1.0 ships #1 via swap; V1.1 ships both — #1 via tab strip, #2 in-canvas): the operator clicks one of two adjacent icons on the per-node action rail, which invoke distinct API functions per the split above. + +1. **New tree** — `branchToNewTree(nodeId)`. V1.0: swap Workspace `currentTree` to the clone (source re-openable from History). V1.1: open as a new tab in the §13 tab strip; operator switches between source and clone via the strip. +2. **Sibling subtree in the same canvas** — `branchToSubtree(nodeId)` (`🌿` icon, V1.1 only). The cloned slice lands as a sibling of the source node within the *same* ConversationTree canvas, sharing the source's root. Operator sees both side-by-side without tab-switching. Useful for "let me try this prompt slightly differently and compare on one screen." + +The mode-2 variant was rejected in revisions 4-6 because it visually collided with fan-outs at the same canvas position. 
The V1.1 reintroduction depends on a small render-rule disambiguation (sibling subtrees from `branchToSubtree` render with a distinct edge style — dashed + labeled "branch" — vs. solid fan edges). The disambiguation is small and not in V1.0's critical path, hence the V1.1 timing. + +**Pursuing N parallel paths** (the "both attempt #3 AND attempt #7 are worth exploring" use case) is `branchToNewTree(treeRoot)` twice, then set a different `promotedChildSlotIndex` in each clone. V1.1 operators flip between the two tabs to compare; **V1.0 operators flip between two browser tabs** — each browser tab holds one Workspace `currentTree`, and the §9.4.3 `BroadcastChannel` advisory lock keeps the two tabs from racing the runner. ExecutionRecords are shared between clones until divergence. + +### 6.6 ExecutionHistory GC (the reflog) + +In the git mental model (§6.8), `executionHistory` is the **reflog** for a conversation tree node: a bounded log of past tips of the per-node ref, used to recover from accidental re-runs and to support "checkout a past run" (detached HEAD). It would grow without bound under heavy re-running, so V1 caps it. + +Each entry is a `ReflogEntry` (per §4.6) — `{ execution: ExecutionRecord, pinned: boolean }`. The `ExecutionRecord` is immutable and may be shared with other trees (per §6.5); the `pinned` flag is per-tree, so pinning in tree A does not affect tree B's view of the same underlying execution. + +- **Default cap `REFLOG_CAP_PER_NODE = 50` per node**, evicting oldest-first (FIFO) over unpinned entries. Bumped from 10 in revision 9 — at ~10 KB per ExecutionRecord and 60 leaves, 50 entries = ~30 MB worst case, which is cheap relative to typical browser-tab memory budgets and covers the "11 refreshes in a row" operator scenario that the previous cap of 10 silently broke. +- **The cap is a Workspace setting**, not a global constant. 
Operators with memory-constrained sessions can lower it; operators with deep-exploration workflows can raise it (up to a hard cap of 200 to keep React rendering responsive). The setting lives in the `Workspace` type (§13.1) alongside the cost-guardrail threshold. +- **Eviction is operator-visible.** When the next push to `executionHistory` would evict an unpinned entry, the runner emits a `WaveEvent` of kind `reflog_eviction` with the evicted execution's `executionId` and a one-line preview. The canvas-level ribbon shows a transient inline marker — *"Past run evicted from node X. [Pin evicted run] [Increase cap]"* — for ~8 seconds. The marker dismisses cleanly so it isn't a modal interrupt; operators who genuinely want every past run know to either pin or raise the cap. *Operator-facing terminology* uses "past run" (the friendly-first convention from [02 §7 Q.7.A](02_tree_ui_affordances.md#7-decisions-and-open-questions)); "reflog" stays in code, data-model docs, and the right-click git-alias menu. +- The evicted `ReflogEntry` is dropped from this tree's reflog. The underlying `ExecutionRecord` may still be referenced by another cloned tree (sharing per §6.5), in which case it stays in memory; otherwise it becomes garbage-collectible. The backend `MessagePiece`s remain regardless (append-only). The leaf AR is still queryable in History via its `labels.wave_id` + `labels.conversation_tree_id`. +- **Operator-facing affordance: `pinExecution(treeId, nodeId, executionId)`** — flips the `pinned` flag on the matching `ReflogEntry` in tree `treeId`'s node. Pinned entries do not count against the cap and are not evicted. The flag is per-tree per-execution; pinning entry E in tree A leaves the same shared `ExecutionRecord` in tree B's reflog unpinned. The runner's `RunnerStateSink` exposes `setReflogPinned(treeId, nodeId, executionId, pinned)` for the UI to call. +- **Out of scope for V1:** purging the backend `MessagePiece`s when a conversation tree node is deleted. 
We treat backend storage as the audit log and never delete from it. + +### 6.7 `makeCurrent` - destructive promotion from the reflog + +The operator's path into a past execution begins with **Checkout this run** (the detached-HEAD analog, see §6.8): they select an entry from the node's reflog (`executionHistory`) and the node enters detached rendering for read-only inspection. From there, **Make current** is the destructive step that promotes the past run back to be the node's current execution. This is the `git reset --hard <commit>` analog. + +```ts +function makeCurrent(nodeId: ConversationTreeNodeId, executionId: string): void +// Pre-condition: executionId must be present in node.executionHistory. +// Post-conditions defined below. +``` + +**Steps (precise):** + +0. **Pre-condition guard.** If `node.execution` is `null` (the node is currently in `failed`/`cancelled` state with no committed run per §6.4.1), step 1 has nothing to move — skip it. The promoted entry simply becomes `node.execution` without a swap; `executionHistory` shrinks by one. This is the **failed-node makeCurrent path**: operator selects a past successful run from the reflog (which is non-empty even when current `execution` is null — see [02 §8.1a detached-on-failed](02_tree_ui_affordances.md#81a-detached-head-on-a-failed-node-v10)) and promotes it; the node transitions from `failed` to `clean` without disturbing the reflog beyond removing the promoted entry. The `node.lastError` field clears as part of step 3. +1. The current `node.execution` is moved to the head of `node.executionHistory` (the position vacated by the promoted entry). **Skip if `node.execution` is null** (per step 0). +2. The promoted past-run becomes `node.execution`. +3. `node.state` ← `clean` (the node is consistent with its new current). +4.
For every strict descendant `d`: if `d.state ∈ {clean, cancelled, failed}` then `d.state ← stale` AND `d.execution ← null` for `failed` descendants (per §6.4.1 — clearing the stale execution lets the retry-on-refresh path treat the node as a fresh dispatch). `running` descendants are ignored — they will recompute their hash on completion and re-evaluate. **`failed` is in the demotion set** (the V1.0 design includes it; earlier framings that excluded it left operators with a `failed` subtree that wouldn't retry after `makeCurrent`, requiring manual clearing of each failure — operator-hostile by silence). The makeCurrent operator action is "the upstream is different now, give the subtree a clean slate"; that includes failures whose root cause may have been the now-displaced upstream. +5. The node exits detached rendering. +6. **No wave is generated by `makeCurrent` itself.** It's a pure pointer swap with no ExecutionRecord write. The operator's subsequent `refreshSubtree` to re-run the now-stale descendants is the wave-generating event, and it carries `waveTriggerKind = 'refresh_subtree'`. There is no `'make_current'` enum variant (per §14.4 note). + +**Why descendants stale-cascade (Option A, not orphan or untouched).** Faithful to the §6.3 invariant that no `clean` node has an edited/stale ancestor. The operator's mental model is already "upstream changes -> descendants stale" from `editParams`; `makeCurrent` is just another way to change upstream content, so it follows the same rule. Alternatives considered and rejected: (B) a new `orphaned` state would require a new lifecycle entry for one operation — overengineered; (C) leaving descendants `clean` would violate the §6.3 invariant and confuse operators who'd see a clean node sitting under a node that just changed. + +**Reflog stays bounded.** Step 1 puts the displaced current into the head of `executionHistory`. The promoted entry, which was already in the reflog, is no longer there (it's now `execution`).
Net length is unchanged. If the reflog was already at the cap (`REFLOG_CAP_PER_NODE`, default 50 per §6.6) and step 1 would push it past, the oldest unpinned entry is evicted per §6.6. + +**Pinned past-runs are not disturbed.** If the operator pinned an entry to prevent eviction (§6.6 `pinExecution`), the pin survives a `makeCurrent` of a *different* entry. Only the displaced current goes into the (potentially capped) part of the reflog. + +**UI affordance.** "Make current" is a button in the right-side drawer's reflog tab, surfaced only when the node is in detached state and the selected past run differs from the current execution. Confirmation modal: *"This will replace the current run. The previous run will move into the reflog. Descendants will become stale and need a refresh."* + +### 6.8 Git mental model (for operator vocabulary) + +The lifecycle and propagation rules in §6.1-§6.6 are mechanically straightforward, but new operators tend to grasp them faster when framed as git. The full data-model framing ("each tree is a worktree, the workspace is the repository") is in §13; this subsection covers the lifecycle vocabulary. 
+ | Git concept | PyRIT tree-view equivalent | Fit | +|---|---|---| +| Object store (commits, trees, blobs) | Backend `AttackResult` + `MessagePiece` rows (append-only) | Strong (mapping is exact) | +| Commit | One `ExecutionRecord` (AR + conversation + pieceIds, content-addressed by `resolvedInputHashAtExecution`) | Strong | +| Reflog | `executionHistory: ReflogEntry[]` on a conversation tree node (each entry wraps an `ExecutionRecord` with a per-tree `pinned` flag, §4.6) | Strong | +| Branch ref pointing at HEAD | `execution: ExecutionRecord \| null` on a conversation tree node | Strong | +| **Worktree** | One **ConversationTree** (a tree view canvas instance), see §13 | **Strong** (each worktree has its own HEAD; ours has many HEADs, one per leaf Send) | +| **Workspace / repo root** | The set of all conversation trees the operator currently has open | **Strong** (each conversation tree has its own `conversation_tree_id`; the workspace is the React state container) | +| `git rebase` (rebuild on top of new upstream) | `refreshSubtree` — surfaced in UI as **"Refresh subtree"** (conceptually a rebase) | Strong | +| `git cherry-pick` | The Stack "Pick" operation (`FanNode.params.promotedChildSlotIndex`) | Strong | +| `git branch foo` / `git worktree add ../foo bar` | `branchToNewTree(nodeId)` (V1.0/V1.1) and `branchToSubtree(nodeId)` (V1.1) (§6.5) - UI label is "Clone tree" on root, "Branch from here" otherwise | Strong (cheap; refs only) | +| `git checkout <commit>` (detached HEAD) | Selecting a past `ExecutionRecord` for display only | Strong (V1 "checkout past run" is non-destructive) | +| `git reset --hard <commit>` | Explicit "Make current" affordance on a past run (§6.7) | Strong (destructive op, opt-in; descendants stale-cascade) | +| `git log <branch>` | History tab filtered by `labels.conversation_tree_id` | Strong | +| `git rebase` semantics (rewrites history; old commits unreachable) | Our refresh is **non-destructive**: old `ExecutionRecord`s stay in `executionHistory`, old ARs
stay in the backend keyed by `conversation_tree_id` | **Loose** (intentional: less destructive than git) | +| `git merge` / fast-forward | None in V1 (no DAG merge). V2 `best_of` aggregation is fan-in, not merge. | Out of V1 | +| `git push` / `git pull` | None in V1 (client-only conversation trees). V2 server-side conversation trees introduce these. | Out of V1 | + +**What this means for the design:** + +1. **Friendly verbs in primary UI; git terminology for execution-history concepts only.** Button labels stay close to the API surface — `Refresh node` / `Refresh subtree` / `Refresh tree` — to keep the operator-to-implementation mapping obvious. Git terminology surfaces for the concepts that have no equally-concise English equivalent: "Reflog" or "Past runs" instead of "Execution history"; "Checkout this run" instead of "Switch to past execution"; "Make current" instead of "Promote past execution"; "Cherry-pick" on Stack picks; "Clone tree" / "Branch from here" for `branchToNewTree` (§6.5). The conceptual model in the table above — *refresh-subtree is conceptually a rebase* — survives in tooltips and teaching prose, but is not a button label. Decision recorded against [02 §7 Q.7.A](02_tree_ui_affordances.md#7-decisions-and-open-questions); earlier revisions proposed git verbs as primary button labels ("Rebase" instead of "Refresh subtree"), reverted V1.0-decision so button labels match the API surface verbatim. +2. **Keep underlying labels as-is**: `conversation_tree_id` stays `conversation_tree_id`. Renaming it `branch_id` or `worktree_id` would be misleading — operators see "worktree" in UI text but the JSON key is `conversation_tree_id`. +3. **Detached HEAD is a real state**: when the operator selects a past `ExecutionRecord` from a node's reflog for inspection, the node enters "detached" rendering (dotted border, banner). 
Re-running while detached creates a new tip and exits detached state (default; equivalent to `git checkout -b` + commit, not `git commit` while detached — we never make commits unreachable). UI spec in [02_tree_ui_affordances.md §7](02_tree_ui_affordances.md#7-decisions-and-open-questions). +4. **No structural merge**: trees do not merge in V1; even V2 `best_of` aggregation is a fan-in (one consumer reads N producers), not a structural merge of two conversation trees. + +The rest of §6 (states, transitions, propagation, failures, branching, GC) is the implementation. The git framing in this subsection is operator-facing language; the code keeps the technical names from §6.1-§6.6. + +### 6.9 Node-editor undo (V1.0) + +Operators editing a `UserTurnNode`'s text get native Ctrl-Z inside the textarea (browser-provided, unchanged). **Structural** edits — add a node, delete a node/subtree, edit a node's params, regenerate fan children, makeCurrent — had no recovery path before rev 15. The §9.4.2 `beforeunload` guard catches reload, the §13.1a dirty-edit modal catches tree-swap, but neither helps an operator who deleted the wrong subtree and wants it back. Rev 15 adds a small per-tree in-memory undo stack so Ctrl-Z (or Cmd-Z on macOS) inside the canvas pops the last structural edit. + +**Mechanism: per-tree inverse-op stack.** Each mutating op pushes its inverse onto `tree.undoStack: UndoOp[]`; Ctrl-Z pops and applies the inverse. 
Each variant snapshots the *affected-node-set state* (not just params/execution) so the inverse fully reverses the op's downstream cascade: + +| Op | Snapshot stored on push | Inverse (applied on Ctrl-Z) | +|---|---|---| +| `addNode(n, parent)` | `nodeId` + `autoInsertedChildIds[]` (e.g., the auto-inserted `Send` child when adding a `UserTurn`) | Delete all snapshotted ids | +| `deleteNode(n.id)` | Full subtree (`nodes[]` + `edges[]` + parent edge) | Re-graft the subtree at `parentId` | +| `editParams(n.id, oldParams, newParams)` | `nodeId` + `oldParams` + `priorState` (the node's state before §6.3 rule 1 fired) + `priorDescendantStates: Map<ConversationTreeNodeId, NodeState>` (every descendant the rule re-staled) | Set `params = oldParams`; restore node `state = priorState`; restore each descendant's state from the map | +| `regenerateFanChildren(fanId, ...)` | `fanNodeId` + `oldChildren[]` + `oldChildEdges[]` (per-child execution refs included) | Replace the fan's current children with the snapshotted set | +| `makeCurrent(n.id, ...)` | All [§6.7](#67-makecurrent---destructive-promotion-from-the-reflog) step-4 affected state: `priorExecution` (`null` valid per §6.7 step 0) + `promotedExecution` (the one that was elevated; move back to reflog) + `priorDescendantStates: Map<ConversationTreeNodeId, NodeState>` + `priorDescendantExecutions: Map<ConversationTreeNodeId, ExecutionRecord \| null>` (every descendant whose execution §6.7 step 4 nulled) | Restore node execution + walk every descendant and restore both state and execution from the maps | + +**Callsite ordering for snapshots (V1.0).** Every mutating op MUST snapshot the affected state **before** applying the mutation (since §6.3 rule 1 and §6.7 step 4 are themselves the mutators of `priorState`/`priorDescendantStates`). Implementation: each op's wrapper function captures the snapshot first, runs the underlying mutator, then pushes the `UndoOp` onto `undoStack`. Failing to follow this order produces an undo that "restores" the post-mutation state — silently broken.
+ +**`UndoOp` typedef:** + +```ts +export type UndoOp = + | { + kind: 'add' + nodeId: ConversationTreeNodeId + autoInsertedChildIds: ConversationTreeNodeId[] + } + | { + kind: 'delete' + subtreeSnapshot: ConversationTreeNode[] + edgesSnapshot: ConversationTreeEdge[] // edges within the subtree + the parent-attach edge + parentId: ConversationTreeNodeId + } + | { + kind: 'editParams' + nodeId: ConversationTreeNodeId + oldParams: NodeParams // discriminated by the node's kind; the inverse writes back over the current params + priorState: NodeState // restore on undo (NOT just the params — §6.3 rule 1 mutated state too) + priorDescendantStates: Map<ConversationTreeNodeId, NodeState> // every descendant the §6.3 rule re-staled + } + | { + kind: 'regenerateFanChildren' + fanNodeId: ConversationTreeNodeId + oldChildren: ConversationTreeNode[] + oldChildEdges: ConversationTreeEdge[] + } + | { + kind: 'makeCurrent' + nodeId: ConversationTreeNodeId + priorExecution: ExecutionRecord | null // §6.7 step 0: null is a valid prior (failed-node makeCurrent path) + promotedExecution: ExecutionRecord // the run that was promoted; move back to reflog on undo + priorDescendantStates: Map<ConversationTreeNodeId, NodeState> + priorDescendantExecutions: Map<ConversationTreeNodeId, ExecutionRecord | null> + } +``` + +**Snapshot size bounds.** Per-op cost: +- `add` / `editParams` (params-only): O(1) on the node itself; `editParams.priorDescendantStates` is O(descendants). +- `delete` / `regenerateFanChildren` / `makeCurrent`: O(subtree size) — the snapshot is bounded by the affected subtree, not the whole tree. + +At N=20 stack cap × 60-node trees worst case ≈ ~1200 node snapshots in memory ≈ ~12 MB at typical PyRIT node sizes. Acceptable for V1.0; flagged for the V1.x configurable-cap follow-up if operators report memory pressure on very-large trees (per [§1.2 known limitations](#12-v10-known-limitations-sharp-edges-in-what-v10-does-ship)).
+ +**Why state-snapshot widening (rev 16, per reviewer Findings 6 + 7).** The original §6.9 inverse-table from rev 15 stored only `(oldParams, newParams)` for `editParams` and `(oldExecution, newExecution)` for `makeCurrent`. Both inverses were structurally lossy because the underlying ops mutate more than the named fields — `editParams` triggers the §6.3 rule 1 cascade that stales every clean descendant; `makeCurrent` triggers the §6.7 step 4 cascade that stales + nulls executions on every strict descendant. Re-applying the named-field inverse without the state-snapshot restoration left descendant nodes in `stale` with stale `lastError` strings — Ctrl-Z visually "did something" but the operator's tree was still half-broken. The state-snapshot widening adds bounded per-op memory in exchange for honest undo semantics. Rejected alternatives: (a) "trivial full-tree-snapshot per op" — 12MB → ~200MB at the same N=20 × 60-nodes worst case; (b) "document the limitation in §1.2 and ship partial undo" — operator-trust cost of half-working Ctrl-Z is bigger than the memory cost. + +**Cap and lifecycle.** Stack cap is hard-coded at **N = 20** for V1.0; eviction is FIFO over the oldest entry when a 21st push lands. Stack is **per-tree** — cleared on `openTree`, `newTree`, `closeTree` (the tree-swap operations that drop the source). **`branchToNewTree` carries the source's `undoStack` into the clone** (per [§13.1](#131-v10-minimal-workspace)) — the carried `edited` state needs corresponding undo entries to be reachable, otherwise an accidental 📋 click would silently lock in every pre-click structural edit. Reload loses it (same contract as edits-since-last-Refresh per [§9.4.1](#941-reload-reconstruction-v10)). No persistence to sessionStorage in V1.0 (avoids another schema-versioned key under [§13.1 schema versioning](#131-v10-minimal-workspace); operators who reload lose undo state as expected). 
+ +**Key binding (avoid stealing native input undo).** The Ctrl-Z handler is registered on the react-flow `<ReactFlow>` element's `onKeyDown`, NOT on `window`. When a textarea or input has focus inside a node card, the key event bubbles to native handling first (typing-level undo). When focus is on the canvas (no input focused, or operator pressed Esc to blur the input), Ctrl-Z reaches the structural-undo handler. Operators editing text and wanting structural undo press Esc first to blur, then Ctrl-Z; documented in tooltip. + +**What's NOT in scope (V1.0):** + +- **Redo** (Ctrl-Shift-Z). V1.x adds a symmetric redo stack if operators report needing it; the inverse-op model already supports it (each `Ctrl-Z` pop would push the original op onto a redo stack, cleared on next non-undo edit). +- **Wave/refresh undo.** Refresh waves produce new backend `AttackResult`s that are append-only; undoing a wave at the runner layer would not delete its ARs, only restore tree-side state. Operator recovery for an unwanted wave's effect is the [§6.7 reflog `makeCurrent`](#67-makecurrent---destructive-promotion-from-the-reflog) workflow — surgical, AR-aware, already shipped. +- **Persistent undo across reload.** Out of V1.0; reload loses the stack. Matches the V1.0 reload-loss contract. +- **Configurable cap.** Hard-coded N=20 in V1.0; V1.x moves it to `WorkspaceSettings.undoCap` once operator usage signals the cap is wrong. + +**V1.0 known-limitation cross-reference:** [§1.2](#12-v10-known-limitations-sharp-edges-in-what-v10-does-ship) names the in-memory + per-tree + no-redo trade-offs so operators reading the cut surface see the boundaries. + +## 7. Mapping to the Existing Backend + +V1 needs **zero new endpoints**. 
The materialization rule is also simpler than revision 2 thanks to the AR-per-leaf decision (§12.1): the runner uses `CreateAttackRequest.prepended_conversation` ([attacks.py:L238-L239](../../../pyrit/backend/models/attacks.py#L238-L239), capped at 200 messages - plenty of headroom for V1) to inject the upstream context, and every leaf is a sovereign `AttackResult`. + +### 7.1 ConversationTree operation → backend call + +| ConversationTree operation | Backend call | Notes | +|---|---|---| +| Refresh a leaf `SendNode` | `POST /attacks` with `prepended_conversation` = resolved clean prefix (root→leaf, clean Sends only), `labels.conversation_tree_id` set; then **N `POST /attacks/{new_id}/messages` calls** in sequence, one per stale `Send` on the leaf's path (including the leaf itself) per [03 §3.3](03_runner.md#33-dispatch-step-leaf-sendnode--partition--create_attack--sequential-add_message-calls). For an all-clean-prefix leaf, N=1 (just the leaf's turn). | Each leaf gets its own `AttackResult`. No `source_conversation_id` needed. | +| Refresh an interior `SendNode` (has a `SendNode` descendant) | **Aliased to `refreshSubtree(id)` restricted to descendant leaves** (per [§6.3](#63-propagation-rules)). Each descendant leaf dispatches its own `create_attack` + N `add_message` sequence (per [03 §3.3](03_runner.md#33-dispatch-step-leaf-sendnode--partition--create_attack--sequential-add_message-calls)); the interior Send is regenerated as part of each descendant's fresh suffix, with intra-wave memoization ([03 §3.2](03_runner.md#32-what-gets-dispatched)) deduplicating shared regenerations across leaves. | No interior Send ever dispatches as its own AR; AR-per-leaf is preserved by construction. | +| Add and execute a `FanNode(axis=*)` | Per child: same as \"Refresh a leaf `SendNode`\" - each variant becomes its own `AttackResult` with its variant's payload baked into the resolved input | All siblings share the same `conversation_tree_id` label so they group in history. 
| +| Add `ImportMessageNode` (or auto-reverse from history - §9.3) | `GET /attacks/{id}/messages?conversation_id=…` to hydrate; no write | Read-only; no new AR. | +| Branch from node (§6.5) | Pure tree-level deep copy of root-to-node path + node's descendants, with fresh ids; **no backend call until the operator refreshes**. `branchToNewTree` (V1.0) swaps the active `currentTree`; `branchToSubtree` (V1.1) lands the slice as a sibling subtree in the same canvas. On refresh, the new leaves create new ARs under a fresh `conversation_tree_id` with `parent_conversation_tree_id` set. | Branch stays cheap; backend cost is proportional to what the operator chooses to re-execute. | +| Promote a leaf to \"main\" in history filter | (no backend call) Apply UI filter: `?label=conversation_tree_id:T` and pin one row | The backend's `POST /attacks/{id}/update-main-conversation` is for the *within-AR* notion of main, which AR-per-leaf eliminates. | +| Read execution result | `GET /attacks/{id}/messages?conversation_id=…` | Each AR has exactly one conversation under AR-per-leaf. | + +**Why interior Sends don't reuse a chain AR (alternative considered).** Earlier revisions of this table had interior Sends append to "the chain's existing `AttackResult`" via `POST /attacks/{id}/messages` against an intermediate AR. That model required intermediate ARs to exist as scaffolds spanning multiple linear Sends, and broke down at fan boundaries (no obvious AR to append to without crossing the §7.2 AR-per-leaf rule). 
The alias-to-leaf-dispatch rule above collapses both problems: every leaf is sovereign, and interior Sends are reachable only through their descendants — consistent with [§6.3](#63-propagation-rules) (interior Sends never appear in the dispatch ready queue) and [03 §3.3](03_runner.md#33-dispatch-step-leaf-sendnode--partition--create_attack--sequential-add_message-calls) (every dispatch is leaf-anchored, holds one concurrency slot for the whole `create_attack + N add_message` sequence). + +### 7.2 ConversationTree-to-execution materialization rule + +Under the AR-per-leaf decision (§12.1): + +1. **Each maximal linear chain ending in a leaf `Send` maps to one `AttackResult`.** + - A path from root to leaf with no fan-out crossing → 1 AR, 1 conversation, N turns (one per `Send` in the chain). + - A path that crosses a fan-out node → the boundary closes the upstream chain (which has its own AR if and only if it contains at least one `Send`) and each child variant starts a fresh AR. +2. **Each fresh AR is created via `POST /attacks` with `prepended_conversation` = the resolved input from root to the new chain's first `Send`.** No `source_conversation_id`; no intermediate AR scaffolds; no related-conversation chaining. The fresh AR carries: + - `labels.conversation_tree_id = <tree id>` - stable across the whole conversation tree, enables grouping in history. + - `labels.tree_path = <JSON-encoded array of [axis, slotIndex] tuples>` — e.g. `'[["converter",1],["attempt",3]]'` for a leaf under nested converter-then-attempt fans. **Required in V1.0** (per [03 §4.3 tree_path encoding](03_runner.md#tree_path-encoding-v10-json-to-keep-forward-compatible)). Earlier revisions used a delimited format (`"converter=base64/attempt=3"`); the JSON encoding ships in V1.0 to avoid silent breakage if future fan-axis names contain `/` or `,`. 
+ - `labels.operator`, `labels.operation` - inherited from the current operator (matches today's `handleBranchAttack` at [ChatWindow.tsx#L456-L475](../../../frontend/src/components/Chat/ChatWindow.tsx#L456-L475)). +3. **Lineage on prepended pieces is preserved via `MessagePieceRequest.original_prompt_id`** ([attacks.py:L202-L207](../../../pyrit/backend/models/attacks.py#L202-L207)). When the runner builds the `prepended_conversation` payload, it carries forward the source piece's UUID so the new piece's `original_prompt_id` points back. This costs nothing extra and preserves the existing PyRIT lineage primitive. +4. **Cross-target paths are not special.** Because every leaf is already its own AR, a `FanNode(axis='target')` is no different from any other axis - the AR-per-leaf rule already produces one AR per variant. The cross-target guard ([attack_service.py:L654](../../../pyrit/backend/services/attack_service.py#L654)) only blocks *appending* messages to an AR with a mismatched target; since AR-per-leaf never appends across targets, the guard is naturally satisfied. + +#### Why `prepended_conversation` instead of `source_conversation_id` + `cutoff_index`? 
+ +Two alternatives were considered: + +| Strategy | Calls per fan boundary | Intermediate ARs | Lineage | Verdict | +|---|---|---|---|---| +| **A: `prepended_conversation` per leaf** (chosen) | 1 `POST /attacks` per child variant | None - fresh AR each time | Explicit via `MessagePieceRequest.original_prompt_id` on each prepended piece | Simpler runner, no AR stubs, one extra field on each prepended piece is cheap | +| **B: `source_conversation_id` + `cutoff_index` chain** | 1 `POST /attacks` per fan child (with source set) | Yes - a "scaffold" AR per linear segment between fan boundaries | Automatic via `_duplicate_conversation_up_to` ([attack_service.py#L824-L870](../../../pyrit/backend/services/attack_service.py#L824-L870)) | More API calls, more AR rows, but matches today's `handleBranchAttack` 1:1 | + +Strategy A wins on simplicity and call count, with no fidelity loss because `original_prompt_id` is independently settable on prepended pieces. + +### 7.3 Lineage write - V1 omits it + +Revision 2 proposed writing `prompt_metadata["conversation_tree_node_id"]` on each persisted piece. With client-only conversation tree persistence (§12.0), this would produce **persistent pointers to tree nodes that die with the browser tab**. The orphaned-pointer migration concern is real. + +**V1 decision: do not write `conversation_tree_node_id` into `prompt_metadata` at all.** The runner keeps tree-execution correlation in its own in-memory state (the `ExecutionRecord.pieceIds` array on each `ConversationTreeNode`); no metadata is written to the backend. Trade-offs: + +- **(−) No server-side query "give me all pieces from tree node X".** V1 simply doesn't need this - the conversation tree is in the same React process as the runner. 
+- **(+) No data poisoning.** Every `conversation_tree_node_id` ever written would have been imprecise per the duplication problem the reviewer flagged ([attack_service.py:L824-L870](../../../pyrit/backend/services/attack_service.py#L824-L870)). Not writing them avoids the question entirely. +- **(+) V2 conversation tree persistence ships clean.** When V2 introduces server-side conversation trees, it can write a fresh, namespaced metadata key (e.g. `plan_node_ref_v2: {conversation_tree_id, node_id}`) without competing with V1 noise. + +`labels.conversation_tree_id` (on `AttackResult`, **not** `prompt_metadata`) is the only metadata V1 stamps onto backend records. It survives reloads, groups history rows, and never participates in piece-level lineage - so it cannot be poisoned by `duplicate_messages`. + +### 7.4 Recommended (small) backend extensions - deferred + +Revision 2 listed three optional backend tweaks. All three are deferred: + +- **`CreateAttackRequest.metadata_overrides`** - unnecessary in V1 because we don't write piece-level lineage metadata at all. +- **`PATCH /attacks/{id}/conversation_tree`** - unnecessary because conversation tree storage is client-only. +- **Bulk per-piece metadata update** - unnecessary because we don't write piece-level metadata. + +These all return as live options when V2 (server-side conversation tree) is designed. + +**One backend ask is not deferrable** — it's a soft dependency for the operator-isolation posture (§9.1): + +- **`_validate_operator_match` must read from `AttackResult.labels["operator"]`, not `piece.labels["operator"]`.** Today the check reads the operator label from existing message pieces ([attack_service.py:L693-L694](../../../pyrit/backend/services/attack_service.py#L693)). The path that writes those piece labels ([attack_mappers.py:L502](../../../pyrit/backend/mappers/attack_mappers.py#L502)) is `removed_in="0.16.0"`. 
When it goes, the piece-label check silently no-ops and the server-side operator-isolation check disappears for tree-UI traffic — reducing operator isolation to a UI-only posture. The fix: relocate the check to read `AttackResult.labels["operator"]` for the AR the conversation belongs to. **Revision 9 brings this into the V1.0 PR set** — see §9.4.5 for the elevation rationale and PR sequencing. Earlier revisions treated this as a deferred PyRIT-core ask; that gamble ("someone else will fix it before 0.16.0") was too fragile for V1.0's defense-in-depth story. + +### 7.5 Storage cost - what AR-per-leaf actually costs + +For the §4.4 worked example (`Fan(3) × Fan(5) × Fan(4)` = 60 leaves): + +| Quantity | V1 (AR-per-leaf via `prepended_conversation`) | Revision 2 (one AR, many conversations) | +|---|---|---| +| `AttackResult` rows | 60 | 1 | +| `Conversation` IDs (memory rows) | 60 | 60 | +| `MessagePiece` rows | 60 prepended-as-user pieces + 60 assistant responses = 120 | ~213 duplicated pieces + 60 leaf-produced pieces ≈ 273 | +| Backend write calls | 60 `POST /attacks` + 60 `POST /attacks/{id}/messages` = 120 | 1 `POST /attacks` + 78 `POST /attacks/{id}/conversations` + 60 `POST /attacks/{id}/messages` ≈ 139 | +| History view rows (without grouping) | 60 (filterable by `label=conversation_tree_id:T`) | 1 | + +AR-per-leaf trades **more `AttackResult` rows** (60 vs. 1) for **fewer total pieces** (120 vs. 273), **simpler runner code** (no chained source_conversation_id walks), and **richer history filtering** (each leaf is independently queryable). The history view bloats and needs a `conversation_tree_id` filter affordance - noted in §9.4. + +## 8. Renderer & Layout + +### 8.1 Renderer choice - react-flow, with the door open + +The renderer is **`@xyflow/react`** (react-flow v12) for V1. 
The reasoning is honest, not religious: + +| Option | Bundle (gzipped) | Tree fit | DAG fit | Pan/zoom built-in | Custom node components | V1 effort | Verdict | +|---|---|---|---|---|---|---|---| +| **`@xyflow/react`** | ~45 KB | Good | Good | Yes | First-class | Lowest - install + 1 day of glue | **Chosen** | +| Roll our own (SVG + CSS Grid + a pan-zoom hook) | ~5 KB | Good | OK | No - we'd write it | First-class | ~2 weeks of polish to reach react-flow's baseline | Saves ~40 KB; not worth the time | +| Cytoscape.js + `react-cytoscapejs` | ~150 KB | Good | Excellent | Yes | OK - not as React-native | Medium | Overkill; less idiomatic for React | +| D3 directly | ~60 KB (modules) | Good | Good | Manual | Manual | High - we'd be writing react-flow ourselves | Rejected | +| Mermaid (render-only) | ~600 KB | Excellent visuals | Excellent | Implicit | None - it's a renderer | N/A | Static; can't edit | + +The bundle-size win of rolling our own (~40 KB) is real but small relative to the existing app (~500 KB of Fluent UI), and the polish work (focus management, edge routing, selection multi-state, viewport persistence) is exactly the work react-flow exists to do. + +**Lock-in is mitigated by the §8.3 abstraction:** the conversation tree model knows nothing about react-flow. A single `conversationTreeToReactFlow` adapter is the only file that imports `@xyflow/react`. If we hit a wall (perf with 1000+ nodes; a11y issues), the swap surface is one module. + +### 8.2 Layout choice - Buchheim-Walker via `d3-hierarchy` + +Revision 3 originally recommended a custom recursive DFS layout. Revision 4 upgrades to **Buchheim-Walker (tidy tree)** via [`d3-hierarchy`](https://github.com/d3/d3-hierarchy) for the same time complexity, tighter horizontal packing, and better stability under edit. 
The choice and the wider layout architecture (main-path pinning, adaptive stack collapse, edge routing, animation policy) are fully argued in [02_tree_ui_affordances.md §4](02_tree_ui_affordances.md#4-layout); this section is the abbreviated rationale. + +| Algorithm | Bundle cost | Tightness | Equal-subtree symmetry | Stability under edit | Verdict | +|---|---|---|---|---|---| +| Custom recursive DFS (sum of child widths) | 0 | Loose | Yes | OK | Was revision 3's choice; superseded | +| **`d3-hierarchy.tree()` (Reingold-Tilford / Buchheim-Walker)** | ~10 KB gzipped | Tight (subtree contours interleave) | Yes | Good | **Chosen** | +| `dagre` (`rankdir=TB`) | ~30 KB gzipped | Good | No | OK | DAG-oriented; overkill | +| `elkjs` (`mrtree`) | ~400 KB gzipped | Best in class | Yes | Good | Bundle cost too high | +| Force-directed | ~50 KB | Variable | No | Bad | Wrong shape for our tree | + +**Why upgrade from custom DFS:** naive DFS reserves `Σ width(children)` for every parent, which wastes horizontal space when subtrees are very different sizes. Buchheim-Walker lets small subtrees nestle into the gaps of large ones, often halving total width. Our typical tree has wide fan-outs next to narrow chains, so the tightness win is substantial. The +10 KB bundle cost is paid by `d3-hierarchy` only - we do NOT depend on the rest of `d3`. + +**Three layers, applied in order** (full pseudo-code in [02_tree_ui_affordances.md §4.3](02_tree_ui_affordances.md#43-recommendation-buchheimwalker--pinned-main-path--adaptive-collapse)): + +1. **Pinned main path.** If any leaf is starred (§2.2 in the affordances doc), pin every node on the root→starred-leaf chain to a fixed centerline x. Off-main subtrees lay out to one side. +2. **`d3-hierarchy.tree()` for off-main subtrees** with the main-path-side contour treated as a wall. +3. 
**Render-time stack collapse** for nodes that the parent-walk peer rule (see [02_tree_ui_affordances.md §3](02_tree_ui_affordances.md#3-the-stack--two-distinct-visual-aggregations)) identifies as Stack peers. + +**Edge routing:** `type: 'smoothstep'` (orthogonal with rounded corners) - mirrors org-chart conventions which operators read top-down. Reasoning in [02_tree_ui_affordances.md §4.4](02_tree_ui_affordances.md#44-edge-routing). + +### 8.3 The conversation tree → renderer adapter + +```ts +// One ConversationTreeNode → one rendered React Flow Node. +import type { Node as RfNode, Edge as RfEdge } from '@xyflow/react' + +type RfData = { node: ConversationTreeNode } // union narrows by node.kind inside the component + +function conversationTreeToReactFlow(tree: ConversationTree, layout: LayoutFn): { nodes: RfNode[]; edges: RfEdge[] } { + const positions = layout(tree) + const nodes = tree.nodes.map(p => ({ + id: p.id, + type: p.kind, + position: positions.get(p.id)!, + data: { node: p }, + })) + const edges = tree.edges.map(e => ({ + id: e.id, + source: e.parentId, + target: e.childId, + sourceHandle: `slot-${e.slotIndex}`, + animated: nodeIsRunning(e.childId), + })) + return { nodes, edges } +} +``` + +The conversation tree model and the layout engine are both pluggable. The renderer is the only piece bound to a specific library. + +Each `kind` registers a custom React component in `nodeTypes`. The component receives `data.node` and renders: + +- Header: kind badge + node title (e.g. truncated prompt) + state pill (clean/edited/stale/running/failed/cancelled) +- Body: kind-specific (e.g. `UserTurnNode` shows the text with an inline `Edit` affordance; `FanNode` shows axis + variant count) +- Footer: action row - `Refresh`, `Branch` (📋, label varies by context — see §6.5), `Add child`, `Delete` + +Fan nodes render N source handles on their bottom edge so each output slot is a distinct connection point. 
+ +### 8.4 Accessibility & performance + +**Accessibility:** react-flow's a11y posture is thin (keyboard nav between nodes, screen reader announcements). V1 must not regress the existing Fluent UI keyboard accessibility, so we add: + +- Arrow keys traverse parent / child / sibling (with focus ring). +- Enter opens the node's inline editor; Space refreshes; Shift+Enter refreshes subtree. +- `aria-live` polite announcements for state transitions (`"Node X is running"`, `"Node X completed"`). + +Whether this ships in V1 or as a follow-up is §12.7. + +**Performance:** react-flow v12 doesn't virtualize off-viewport nodes. Combined with the storage cost in §7.5, this informs the soft caps in §9.4 (warn at 200 leaves, refuse fan-outs that would exceed 1000). + +## 9. Multi-Operator, Migration, and Multi-Tab + +The reviewer of revision 1 correctly flagged three blockers that the original doc never addressed. They are foundational, so they get their own section. + +### 9.1 Operator isolation posture + +> **What ships in V1.0 (read this first).** Operator isolation in V1.0 is a **three-layer posture**: (1) the visual 🔒 lock + mutating-affordance disablement on nodes whose latest AR carries a different operator tag (UI); (2) the runner's pre-wave **tag-hygiene gate** ([03 §2.1 entry-point shim step 1](03_runner.md#entry-point-shim-ordering-v10)) that aborts any refresh whose `currentOperator()` is null/empty so no untagged AR ever reaches the backend; (3) the server-side `_validate_operator_match` check (relocated per [§9.4.5](#945-hard-backend-dependency-relocate-_validate_operator_match)) as defense-in-depth against non-tree-UI clients (a second browser tab using the API directly, a Python script). Under AR-per-leaf the server-side check **rarely fires by construction** for tree-UI traffic, because the runner always creates its own AR with its own tag. Point 5 below spells out why. 
**Reframing note (Q.S.2 DECIDED V1.0: operator-as-tag, rev 18 per rubber-duck Finding B.2):** `operator` is a tag the operator picks for History grouping + per-operator AR isolation, **not an auth claim**. The tag is honor-system — a determined operator can set it to any value, including impersonating another operator's tag; the V1.0 posture defends against accidental mis-attribution and casual cross-operator extensions, not against motivated bypass. The "Branch from here is the escape hatch" framing in point 3 below is the consequence: any operator can branch any tree they can read (the source AR was already visible to them in History), creating a fresh AR under their own tag with no auth gate. V1.1 multi-operator collaboration ([§13.8](#138-multi-operator-collaboration-v2)) revisits whether the tag should become a claim; V1.0 ships honor-system. + +The existing GUI enforces operator isolation in two places: + +- **Frontend** ([ChatWindow.tsx#L494-L498](../../../frontend/src/components/Chat/ChatWindow.tsx#L494-L498)): when the loaded attack's `labels.operator` differs from the current user's operator, the entire conversation is read-only. +- **Backend** ([`_validate_operator_match` at attack_service.py#L682](../../../pyrit/backend/services/attack_service.py#L682)): `add_message` raises if the request operator does not match the operator label on existing message pieces in the conversation. **§9.4.5 elevates the relocation of this check to the V1.0 PR set** — once it lands, the check reads from `AttackResult.labels["operator"]` (survives 0.16.0 deprecation). The check retains its existing no-labels early-return behavior: anonymous requests (no `operator` key in `request.labels`) pass through unchallenged, consistent with the operator-as-tag framing — the tag is honor-system, not an auth claim. + +The tree view must respect both. Under AR-per-leaf (§7.2): + +1. 
**Visual lock (primary line of defense under V1.0 runner).** When a conversation tree node's most recent `ExecutionRecord.attackResultId` resolves to an AR with `labels.operator != currentOperator`, render that node with a "locked" badge and disable mutating affordances (`Refresh`, `Edit`, `Add child`, `Delete`). `Branch from here` / `Clone tree` is still allowed — it creates a fresh AR owned by the current operator under a new `conversation_tree_id`. **The visual lock is the only lock that fires for typical V1.0 traffic** — see #5 below for why. +2. **API-level lock (defends against non-tree-UI clients with `operator` labels set).** The runner catches the 400 from the §9.4.5-relocated `_validate_operator_match` and surfaces it gracefully as "node failed - operator mismatch". This fires when a non-tree-UI caller (a second browser tab using direct API access, a Python script) sends a request whose `labels.operator` is *non-empty AND mismatched* against the existing AR's tag. Anonymous callers (no `operator` label) bypass the check by design per the operator-as-tag framing — the tag is honor-system; the API does not pretend to enforce identity. The main value of this layer is defending against operators who set their tag *correctly* but reach for a tree another operator owns. +3. **Branch-into-own-tree as the escape hatch.** Matches the existing "Continue with your target" affordance ([ChatWindow.tsx#L519-L546](../../../frontend/src/components/Chat/ChatWindow.tsx#L519-L546)). **Consistent with operator-as-tag (Q.S.2 rev 18):** any operator who can read the source AR can branch it under their own tag — no auth gate, no confirmation modal naming the cross-operator boundary. If V1.1 promotes `operator` to a claim, this primitive needs a confirmation step; V1.0 ships escape-hatch-as-default. +4. 
**AR-per-leaf simplifies the lock granularity.** Each leaf is its own AR; mixed-operator trees are possible (e.g., the operator imported one leaf from operator A but added their own siblings). The visual lock applies node-by-node, not tree-wide. +5. **The V1.0 runner's API-level lock rarely fires by construction.** Under AR-per-leaf, every `add_message` the runner sends targets an AR the runner *just created* with its own labels — the AR's operator and the request's operator always match. The server-side check therefore never produces a rejection along the runner's normal dispatch path. The check's value under V1.0 is bounded to (a) detecting tree-UI bugs that violate the labeling invariant, and (b) blocking non-tree-UI clients per #2. **Operators must understand that the visual 🔒 badge is purely client-side under V1.0** — it derives from `AttackResult.labels["operator"]` read locally, and a determined non-runner caller with API access could ignore it. Server-side enforcement only fires if the offender bypasses the runner. +6. **The runner sets `request.labels["operator"]` on every `add_message` call** (invariant). This costs nothing today (the existing chat already does it), provides a clean post-0.16.0 path once the backend reads from `AttackResult.labels`, and means the visual lock and the server-side check agree on the same identity. Auto-reverse migration (§9.3) inherits each historical AR's `labels.operator` unchanged. + +### 9.2 Cross-target locking - not a special case under AR-per-leaf + +In revision 2 this was a dedicated subsection. Under AR-per-leaf it dissolves: + +- Every leaf already gets its own AR (§7.2). A `FanNode(axis='target')` produces N ARs the same way any other fan does - each child's `prepended_conversation` payload includes the variant's target. 
+- The backend's `_validate_target_match` ([attack_service.py:L654](../../../pyrit/backend/services/attack_service.py#L654)) only blocks *appending* a message with a mismatched target to an existing AR. Since AR-per-leaf never crosses targets within an AR, the guard is naturally satisfied. +- **What the UI still owes the operator:** a clear visual indicator on a `FanNode(axis='target')` that says "spawns N independent attack results" - since the cost (N rows in history) is operator-visible. + +### 9.3 Migration of existing linear attacks - auto-reverse to a tree + +> **Version scope.** V1.0 ships **(1)** linear-chain reconstruction with per-piece converter pipelines (each user-role `Message` becomes a `UserTurnNode` with converter pipeline hydrated from `MessagePiece.converter_identifiers`; each assistant-role `Message` becomes a `SendNode` rebound to its existing pieces, no re-execution), AND **(2)** the V1.0+ fast-path `detect_fans_v10_plus` algorithm (§9.3.1 Algorithm 1) that decodes `labels.tree_path` to reconstruct nested fan structure for any tree the V1.0 runner produced. This is the load-bearing path for [§9.4.1 reload-reconstruction](#941-reload-reconstruction-v10) — V1.0 sessions reload with their authored tree shape intact. **V1.1 adds the pre-tree-UI fallback** `detect_fans_pre_v10` (§9.3.1 Algorithm 2) for historical ARs that lack `tree_path`; the V1.1 cut surface is concentrated in that algorithm's edge cases (wave_id disambiguation, nesting-loss caveat, hard-deletion handling). The fallback is operator-flagged as "not too important for now" because the dominant historical-attack shape in the PyRIT corpus is single-conversation, and the V1.0 linear reconstruction already covers >90% of pre-V1.0 "Open in tree" use cases without inventing fan-axes the original conversation never had. + +Under §12.6, V1 reverse-engineers an existing AR's conversations into an editable conversation tree by default. 
The mapping: + +| Backend artifact | ConversationTree node | Version | +|---|---|---| +| User-role `Message` | `UserTurnNode { role: 'user', text, attachments, converterPipeline }` - the converter pipeline is hydrated from `MessagePiece.converter_identifiers` ([message_piece.py:L114](../../../pyrit/models/messages/message_piece.py#L114)) | V1.0 | +| Assistant-role `Message` | `SendNode` whose `execution` wraps the existing pieces (no re-execution; just rebind) | V1.0 | +| Simulated-assistant `Message` | `UserTurnNode { role: 'simulated_assistant' }` - inert by construction | V1.0 | +| System `Message` | `UserTurnNode { role: 'system' }` at the top of the chain (or hoisted into the root prompt's `systemPrompt`) | V1.0 | +| `AttackResult.related_conversations` (the historical `handleBranchConversation` results) | Fan-grouped via the §9.3.1 algorithm: leaves sharing a lineage root collapse into an implicit `FanNode(axis='prompt')` at the divergence point. | **V1.1** | + +#### 9.3.1 Fan-grouping algorithms + +> **Version scope.** Algorithm 1 (V1.0+ fast path via `tree_path`) ships in V1.0. Algorithm 2 (pre-V1.0 fallback via `original_prompt_id` chain-flattening + `wave_id` disambiguator) ships in V1.1. The dispatcher in §9.3.2 picks based on label presence. + +The V1.1 fanout detection is the only V1.1 algorithm in §9.3, and it has a cleaner implementation than earlier revisions claimed thanks to a property of `Message.duplicate()` ([message.py:L392-L412](../../../pyrit/models/messages/message.py)) that the previous revision missed. + +**The flattening property.** [`Message.duplicate()`](../../../pyrit/models/messages/message.py) sets `piece.id = uuid.uuid4()` on the new piece but **does not touch `original_prompt_id`** — it explicitly comments "intentionally kept the same to track the origin." 
Combined with the [`_set_original_prompt_id_default` validator at message_piece.py:L182-L190](../../../pyrit/models/messages/message_piece.py) which defaults `original_prompt_id` to `self.id` when None on first construction, the result is: + +- For any fresh piece P: `P.original_prompt_id == P.id` (origin marker). +- For any duplicate D of P (or of *any duplicate of P*, transitively): `D.original_prompt_id == P.id` (root marker). + +Duplication chains **flatten** to a single hop. Walking N levels of duplication is unnecessary; `original_prompt_id` always points at the lineage root. This collapses the fan-grouping primitive from "recursive chain walk" to "hash-bucket group-by." + +**Two algorithms, one fast path and one fallback.** Revision 10 splits §9.3.1 into two cases: + +1. **V1.0+ trees (fast path): decode `labels.tree_path`.** Trees produced by the V1.0 runner stamp every leaf AR with `labels.tree_path` = JSON-encoded array of `[axis, slotIndex]` tuples from root to leaf (e.g., `'[["prompt",2],["attempt",3]]'` for a leaf under a nested prompt-then-attempt fan structure). Full encoding spec in [03 §4.3 `tree_path` encoding](03_runner.md#tree_path-encoding-v10-json-to-keep-forward-compatible) — chose JSON over the earlier `/,...` delimiter format so future axis names can contain arbitrary characters without breaking the parser. This is a complete description of the leaf's position in the tree's fan structure, including **nested fans**. The auto-reverse algorithm decomposes the labels directly and reconstructs the exact tree shape — no chain-walking needed, no nesting lost. +2. **Pre-tree-UI ARs (fallback): the `original_prompt_id` chain-flattening algorithm below.** Existing pre-V1.0 ARs do not have `tree_path`. The algorithm groups by lineage root with `wave_id` disambiguation and synthesizes implicit `axis='prompt'` fans. 
**Nesting is lost** — pre-V1.0 ARs with nested fans (e.g., 3 prompts × 5 attempts = 15 leaves) reconstruct as one flat 15-member fan, because the lineage-flattening algorithm only sees the outermost divergence point per leaf. This is the V1.0-fidelity floor for historical data; V1.0+ trees do strictly better. + +**Algorithm 1 — V1.0+ trees (tree_path fast path, V1.0):** + +```python +def detect_fans_v10_plus(leaf_ars: list[AttackResult]) -> list[ImplicitFan]: + """V1.0 auto-reverse for V1.0+ trees. Reconstructs nested fan structure + by decoding the tree_path label written by the runner ([03 §4.3]).""" + # Step 1: parse each leaf's tree_path into a list of (axis, slotIndex) pairs. + # Example: '[["prompt",2],["attempt",3]]' -> [('prompt', 2), ('attempt', 3)] + # Empty tree_path (no fan ancestors) -> []. + leaf_paths = {ar.id: parse_tree_path(ar.labels.get('tree_path', '')) for ar in leaf_ars} + + # Step 2: build the fan tree bottom-up. Two leaves share a fan iff their + # tree_paths agree on every (axis, slotIndex) pair up to some prefix length, + # then differ. The fan sits at the depth where they diverge. + # Group leaves by their parent fan (= their tree_path minus the last segment). + fans: list[ImplicitFan] = [] + by_parent_path = defaultdict(list) + for ar in leaf_ars: + path = leaf_paths[ar.id] + if not path: + continue # no fan ancestors + parent_path_key = tuple(path[:-1]) + last_axis, last_slot = path[-1] + by_parent_path[parent_path_key].append((ar, last_axis, last_slot)) + + for parent_path, group in by_parent_path.items(): + if len(group) < 2: + continue + # All members of this group share the same parent fan. Operators CAN change + # a fan's axis mid-tree (the [02 §2.2] ≡ icon with confirmation), in which + # case leaves dispatched before the change carry the old axis in their + # tree_path and leaves dispatched after carry the new axis.
Split into one + # ImplicitFan per axis at the same parent_path so the operator sees the + # post-hoc structure honestly: "the fan was attempt then became converter." + by_axis: dict[str, list[tuple[AttackResult, str, int]]] = defaultdict(list) + for member in group: + by_axis[member[1]].append(member) + for axis, axis_group in by_axis.items(): + if len(axis_group) < 2: + continue + fans.append(ImplicitFan( + parent_path=parent_path, # nesting position; can be empty (top-level fan) + axis=axis, # exact, not synthesized + member_ars=[g[0] for g in axis_group], + member_slot_indices=[g[2] for g in axis_group], + )) + return fans # nesting is reconstructable from each fan's parent_path +``` + +**Variant-payload reconstruction (per V1.0 axis).** Algorithm 1 reconstructs the *topology* of each `FanNode` (its axis, its slot count, the leaves at each slot) but does not populate `FanNode.params.variants[i].payload`. Without per-axis derivation the reload produces fan nodes with empty variant payloads — visually present, functionally inert. For `axis='converter'` this is a silent corruption: a 3-slot converter fan reloads with `variants[i].payload.converters = []` for all `i`, and the next refresh fires WITHOUT the converters operators authored. The derivation per V1.0 axis: + +| Axis | Variant payload shape (per [§4.4](#44-structural-nodes--the-single-fan-out-primitive)) | V1.0 derivation rule | +|---|---|---| +| `attempt` | `Record` (empty) | No-op. All slots share the empty payload by definition. | +| `converter` | `{ converters: ConverterRef[] }` | For each `ImplicitFan.member_ars[i]` at slot `s = member_slot_indices[i]`: read the leaf's `prepended_conversation`; find the user-turn at depth `len(parent_path) + 1` from the root (the user-turn the fan child's Send consumes); read its first piece's `converter_identifiers` field ([§9.4.4 (b)](#944-hard-backend--frontend-type-dependencies-for-v10) DTO ext). 
Assign `variants[s].payload.converters = ConverterRef.from_identifiers(piece.converter_identifiers)`. The same `s` may appear in multiple `member_ars` (multiple leaves at the same slot, e.g. the slot is itself nested inside an outer fan); deep-equal across all of them and pick the consensus value. **Divergence handling:** if leaves at slot `s` disagree on `converter_identifiers` (operator manually edited one leaf's user-turn after auto-reverse but before the new wave, or a partial-wave failure left the slot in an inconsistent state), the algorithm picks the most-frequent value across `member_ars` at `s` and renders a warning chip on the fan card: *"Slot `s` reconstruction: N leaves disagreed on converter pipeline. Showing the most-frequent value; review the slot before refreshing."* + +```python +def reconstruct_variant_payloads(fan: ImplicitFan) -> list[FanVariant]: + """Reconstructs FanNode.params.variants for a fan reconstructed by Algorithm 1. + The output array is indexed by slotIndex; gaps in the slot space (deleted slots) + are filled with the axis's empty/default payload.""" + if fan.axis == 'attempt': + # All slots share the empty payload by definition.
+ max_slot = max(fan.member_slot_indices) + return [FanVariant(axis='attempt', payload={}) for _ in range(max_slot + 1)] + if fan.axis == 'converter': + by_slot: dict[int, list[list[ConverterRef]]] = defaultdict(list) + for ar, slot in zip(fan.member_ars, fan.member_slot_indices): + user_turn_piece = _find_user_turn_at_depth(ar.prepended_conversation, len(fan.parent_path) + 1) + converters = ConverterRef.from_identifiers(user_turn_piece.converter_identifiers) + by_slot[slot].append(converters) + variants: list[FanVariant] = [] + max_slot = max(by_slot.keys()) + for s in range(max_slot + 1): + candidates = by_slot.get(s, []) + if not candidates: + variants.append(FanVariant(axis='converter', payload={'converters': []})) + else: + payload, divergence = _consensus_or_most_frequent(candidates) + if divergence: + _emit_reconstruction_warning(fan, s, candidates) + variants.append(FanVariant(axis='converter', payload={'converters': payload})) + return variants + raise NotImplementedError(f"V1.0 ships axis={fan.axis} but reconstruction is not wired; see V1.1 axis-extension plan") +``` + +**V1.1 axes (`prompt`, `target`, `system_prompt`, `temperature`)** each need their own derivation hook. The reload path uses the same per-axis dispatch above; each new axis adds one case. The derivation source per future axis: + +- `axis='prompt'`: read the first user-turn after the fan boundary; its text + attachments become the variant payload. The leaf's prepended_conversation already carries them. +- `axis='target'`: read each leaf AR's `target_registry_name` directly (an AR-level field, not a piece field). +- `axis='system_prompt'`: read the first prepended message with `role='system'` per [03 §3.3a `_systemPrompt_as_prepended_message`](03_runner.md#33a-helpers-referenced-by-the-dispatch-step) — the runner writes system prompts as the first prepended message, so reload reads the same position. 
+- `axis='temperature'`: NOT recoverable from current backend state — the temperature value is sent to the target but not persisted on the AR or its pieces. V1.1 axis-extension PR for `temperature` must add a runner-side label (`labels.fan_variant_temperature = '0.7'`) or carry the value on a new AR field; defer to that PR. Adding it as an inline note here so the V1.1 axis-extension PR doesn't miss the persistence question. + +**Algorithm 2 — pre-tree-UI ARs (original_prompt_id fallback, V1.1):** + +```python +def detect_fans_pre_v10(leaf_ars: list[AttackResult]) -> list[ImplicitFan]: + """V1.1 auto-reverse for pre-V1.0 ARs (no tree_path label). Operates on + leaf ARs sharing one conversation_tree_id (or one source AR for genuinely + pre-tree-UI history).""" + # Step 1: index pieces by lineage root. + # For each leaf AR, find the first piece in its prepended_conversation where + # original_prompt_id != id (i.e. the first duplicated piece). That piece's + # original_prompt_id is the divergence point for this leaf's lineage. + by_lineage_root: dict[uuid.UUID, list[tuple[AttackResult, MessagePiece]]] = defaultdict(list) + for ar in leaf_ars: + for piece in ar.prepended_conversation_pieces: + if piece.original_prompt_id != piece.id: + by_lineage_root[piece.original_prompt_id].append((ar, piece)) + break # first divergence point only (the nesting-loss gap; see below) + + # Step 2: within each lineage-root bucket, disambiguate fan vs. exploration + # by wave_id. Same wave_id = fan members (one operator action). Different + # wave_id = separate explorations branching from the same point over time. 
+ fans: list[ImplicitFan] = [] + for root_piece_id, candidates in by_lineage_root.items(): + if len(candidates) < 2: + continue # not a fan; just a linear chain with one duplicated turn + by_wave: dict[str, list[AttackResult]] = defaultdict(list) + for ar, _piece in candidates: + by_wave[ar.labels.get('wave_id', '')].append(ar) + for wave_id, ars in by_wave.items(): + if len(ars) >= 2: + fans.append(ImplicitFan( + divergence_piece_id=root_piece_id, + axis='prompt', # the only axis we can infer post-hoc + member_ars=ars, + reconstructed_from_wave_id=wave_id or None, + nesting_lost=True, # see "Nesting loss" caveat below + )) + return fans +``` + +#### 9.3.2 Dispatcher + +```python +def detect_fans(leaf_ars: list[AttackResult]) -> list[ImplicitFan]: + """Pick the right algorithm based on whether the ARs are V1.0+ (have tree_path) or pre-V1.0. + + V1.0 ONLY ships detect_fans_v10_plus; V1.1 adds detect_fans_pre_v10. In V1.0: + - All-V1.0+ leaves: full reconstruction via Algorithm 1. + - Any leaves missing tree_path: those leaves render as flat under their conversation_tree_id + (no implicit fans synthesized). Acceptable for V1.0 because pre-V1.0 ARs are bounded + historical corpus; V1.0-produced trees always carry tree_path. + + In V1.1: + - All-V1.0+ leaves: same as V1.0 (Algorithm 1). + - All pre-V1.0 leaves: Algorithm 2. + - Mixed presence under one conversation_tree_id (e.g., a long-running attack that spans + the V1.0 release boundary): falls back ENTIRELY to detect_fans_pre_v10 over all leaves. + This trades fidelity (loses nesting on the V1.0+ leaves that could have used the fast + path) for CONSISTENCY: a single tree's reconstructed shape never has two disjoint fan + systems that don't relate to each other. Operators see one topology, even if it's the + flatter one. The mixed-presence case is uncommon enough that the fidelity loss is + acceptable. 
+ """ + has_tree_path = [ar for ar in leaf_ars if 'tree_path' in ar.labels] + no_tree_path = [ar for ar in leaf_ars if 'tree_path' not in ar.labels] + # V1.0 branch: only Algorithm 1 exists; leaves without tree_path render flat. + if not FEATURE_FANOUT_DETECT_PRE_V10: + return detect_fans_v10_plus(has_tree_path) + # V1.1 branch: full dispatcher + if has_tree_path and no_tree_path: + return detect_fans_pre_v10(leaf_ars) + if has_tree_path: + return detect_fans_v10_plus(has_tree_path) + return detect_fans_pre_v10(no_tree_path) +``` + +**Why `wave_id` is a required disambiguator in the fallback algorithm:** a tree whose root prompt is refreshed three times produces three distinct waves of leaves, all sharing lineage roots at the root prompt's pieces. Without `wave_id`, the algorithm would synthesize one giant `FanNode(axis='prompt')` with all three waves' leaves bundled — *wrong*: those were three separate operator actions, not one fan-out. With `wave_id`, the same lineage root produces three separate `ImplicitFan`s, each correctly grouping one wave's leaves. The `wave_id` field is required for correctness of the fan-vs-explorations distinction; demoting it to "bonus" would silently mis-group the most common operator workflow. + +**Special case: leaves without `wave_id` (pre-tree-UI ARs).** Pre-V1.0 ARs have no `wave_id` label. They land in the empty-string bucket; if 2+ leaves share a lineage root and all have empty `wave_id`, the algorithm still synthesizes a fan but tags it `reconstructed_from_wave_id: null` so the operator sees a "best-guess fan" badge in the UI. This is the V1.0-fidelity floor for pre-tree-UI history; V1.0+ trees do strictly better via the `tree_path` fast path above. + +**Nesting loss in the fallback (acknowledged caveat).** The `break # first divergence point only` line in `detect_fans_pre_v10` stops at the *outermost* lineage divergence.
A pre-V1.0 tree with nested fans (e.g., `Fan(prompt, 3) × Fan(attempt, 5)` = 15 leaves) reconstructs as **one** flat fan with 15 members rooted at the outer divergence point — the inner attempt-fan structure is lost. The `ImplicitFan.nesting_lost: bool` flag surfaces this honestly in the UI ("reconstructed from history — original nesting unrecoverable"). V1.0+ trees do not have this loss because `tree_path` preserves nesting. + +**Edge cases handled:** + +- **Cross-conversation lineage** (lineage chains spanning `conversation_id`s): the algorithm doesn't care — `original_prompt_id` is the only key it reads. The PyRIT `duplicate_messages` machinery ([memory_interface.py:L996-L1020](../../../pyrit/memory/memory_interface.py)) sets `conversation_id = new` on duplicates but leaves `original_prompt_id` pointing at the source piece (potentially in a different conversation). ✓ +- **Hard-deletion of intermediate pieces** (orphaned lineage): if the root piece P is hard-deleted from the backend, every descendant still carries `original_prompt_id = P.id` but cannot resolve P for display. The algorithm treats this as "valid lineage root with no displayable parent" — fan-grouping proceeds; the implicit FanNode renders with a "source piece no longer in memory" badge. ~3 LOC defensive check at indexing time. +- **`original_prompt_id` nullability** (in theory): per the `_set_original_prompt_id_default` validator, persisted pieces always have a non-null `original_prompt_id`. The frontend DTO type can declare `original_prompt_id: string` (not `string | null`) once exposed via the §9.4.4 hard backend dependency. The patch #5 algorithm relies on non-null. +- **Multiple branches from the same UserTurn over time** (3 separate explorations on day 1, 4, 9): all converge at the same lineage root P. Different `wave_id`s per branch → three separate `ImplicitFan`s, not one fan-with-3-variants. Operator gets accurate visual representation of "I explored from here three times." 
+ +#### 9.3.3 Backend dependency (now hard — see §9.4.4) + +§9.3 historically called the DTO extension a "soft" dependency. **Revision 9 elevates it to a hard dependency** because §9.4.1 reload-reconstruction depends on it; the full statement and sequencing is in §9.4.4. The required additions to `BackendMessagePiece` (DTO + mapper + frontend type) remain: + +- `converter_identifiers: list[ComponentIdentifierField]` — V1.0 needs this to render reconstructed `UserTurnNode`s with the right converter pipeline; otherwise V1.0 auto-reverse silently produces empty-pipeline turns indistinguishable from "no converter used." +- `original_prompt_id: string` — V1.0 ships this preemptively (V1.0 doesn't read it; V1.1 fanout-detection §9.3.1 does). One PR, no surprises later. + +The change is small (~5 lines across `pyrit/backend/models/attacks.py`, `pyrit/backend/mappers/attack_mappers.py`, `frontend/src/types/index.ts`) and self-contained. The V1.0 PR set carries it; see §9.4.4. + +#### 9.3.4 Fidelity caveats (V1, all acknowledged) + +- The conversation tree is a *fiction*: the original conversations were not authored as a conversation tree, and the reverse mapping has to invent fan axes for branches that were operator-chosen. The §9.3.1 algorithm always synthesizes `axis='prompt'` because no other axis can be inferred from the post-hoc data. We label these implicit fans visually (`"reconstructed from history"`). V1.0 sidesteps the problem entirely by not synthesizing fans. +- **Hard-deletion fallback** (V1.1; covered above): orphaned lineage roots render with a "source piece no longer in memory" badge. +- Converter pipeline reconstruction reads only what the piece records; if the original converter was an inline (unregistered) one, we surface it as a non-editable badge. 
+- **For V1-produced trees (round-trip fidelity).** The runner always writes `labels.conversation_tree_id`, `labels.wave_id`, `labels.wave_trigger_kind` (§14.4), and `labels.tree_path` (§9.3.1 fast path) on every leaf AR. **V1.0 auto-reverse runs `detect_fans_v10_plus` on these ARs** — the `tree_path` JSON-encoded `[[axis, slot], ...]` array reconstructs the exact tree shape including nested fans AND the original fan-axis intent (`attempt`, `converter`). V1.0 trees round-trip cleanly without depending on V1.1. **Pre-V1.0 ARs (no `tree_path` label)** render as flat under their `conversation_tree_id` in V1.0; **V1.1 adds `detect_fans_pre_v10`** which synthesizes `axis='prompt'` fans for them via the lineage-flattening algorithm. + +**`ImportMessageNode` remains in the kind set** for operators who want the read-only fast path (§4.1) - useful for very long historical attacks where materializing 200 tree nodes is overkill. + +### 9.4 Client-only mode: reload reconstruction + remaining limitations + +Under the V1 client-only decision (§12.0), conversation trees live in React state. Earlier revisions accepted "reload loses everything" as the operator-visible cost; **revision 9 rewrites this section** to use server metadata for reconstruction (the refresh waves already write enough labels to rebuild the tree shape on reload), demoting the cost to "edits made since the last Refresh are lost." + +#### 9.4.1 Reload reconstruction (V1.0) + +On every `Workspace` mutation that establishes which tree is foregrounded, the URL fragment carries `?conversation_tree_id=` so reload deterministically picks up the same tree. + +On reload, the boot sequence is: + +0. **Schema-version check (V1.0).** Read `pyrit.schemaVersion`. 
If absent OR not equal to the current version (`'1'` in V1.0), wipe every `pyrit.*` sessionStorage key, write the current version, and surface a one-line toast: *"Saved settings were from a different version and have been reset."* The remaining steps then run as if sessionStorage were empty (each lookup misses, each fail-soft path runs). Full rationale and drop-on-mismatch contract in [§13.1 Schema versioning](#131-v10-minimal-workspace). +1. Read `conversation_tree_id` from the URL fragment (or `sessionStorage` fallback for browsers/operators that strip fragments). +2. If absent → start with empty Workspace (greenfield). +3. If present → call `GET /api/attacks?labels.conversation_tree_id=` (existing endpoint; uses the History tab's existing filter machinery). +4. Run the auto-reverse mapping (§9.3) over the returned ARs to rebuild the tree. +5. **Hoist tree-level metadata from leaf labels.** Read `labels.parent_conversation_tree_id` from any returned leaf AR; if present and all leaves agree, set `tree.parentConversationTreeId` to that value. (Assert all leaves agree — the runner writes the same `parent_conversation_tree_id` on every leaf of a cloned tree per [§13.3](#133-conversationtree-typedef-v10); divergence indicates a multi-clone-source merge that V1.0 doesn't produce, so we fail-soft to `null` with a console warning rather than picking one arbitrarily.) Without this hoist (reviewer rev-16 Finding 5), reload silently loses the parent pointer; History "Open clones of T" navigation breaks for any tree reloaded mid-session. +6. The reconstructed tree is rendered identically to a tree that was authored in this session. + +**What survives reload:** + +- Every leaf with at least one completed execution (the AR carries the lineage labels per §9.4.4). +- Per-leaf converter pipelines (V1.0; via `MessagePiece.converter_identifiers` per §9.3 — gated on the §9.4.4 hard backend dependency). +- The `conversation_tree_id` grouping (filter-driven; cheap). 
+- Fan groupings for any tree produced by the V1.0 runner (decoded from `labels.tree_path` per §9.3.1). +- For V1.1+ trees: picked-child state (read from labels). + +**What does NOT survive reload (V1.0 acknowledged cost):** + +- **Structural edits since the last Refresh.** A `UserTurnNode` added but never refreshed has no backend AR; reload doesn't see it. Operator surface for this: the §9.4.2 `beforeunload` guard. +- **Fan structure for pre-V1.0 ARs only.** V1.0 auto-reverse runs `detect_fans_v10_plus` (§9.3.1) on every reload, decoding `labels.tree_path` to reconstruct exact nested fan structure for any tree produced by the V1.0 runner. **Operators reloading a V1.0 session see their full tree shape restored** — same fan layout and same per-leaf converter pipelines; only the `promotedChildSlotIndex` selections are lost (see the per-fan bullet below). The V1.1 cut is `detect_fans_pre_v10`, which reconstructs fans for pre-tree-UI ARs (no `tree_path` label); those still display as flat under each `conversation_tree_id` in V1.0. Pre-V1.0 ARs are bounded (existing corpus), V1.0-produced ARs are the dominant volume going forward, so the cut hits the right surface. +- **Reflog entries past the most-recent execution per node.** The local reflog cap (§6.6) is per-session; on reload, each node starts with reflog = `[]` and rebuilds from any subsequent Refresh. Backend ARs are still queryable in History; they just don't reappear in the per-node `executionHistory` array. +- **Per-fan `promotedChildSlotIndex` selections (V1.0).** V1.0 does not write Pick/Unpick state to backend labels; on reload, every fan returns to Synced. V1.1 adds `labels.promoted_slot_index` (cheap; one int per fan) to round-trip this. +- **Stack-`+` synced-peer membership (V1.1 only — moot in V1.0 since Stack-`+` is V1.1).** V1.1 reconstruction uses the `original_prompt_id` lineage chain rule from §9.3. + +**Pre-V1.0 fallback (V1.0).** If the labels-query at step 3 returns no rows AND `sessionStorage` has `pyrit.workspace.parentSourceConversationId.<X>
= Y`, the reconstruction falls through to `GET /api/attacks?conversation_id=Y` (legacy hydration) and rebuilds the same tree shape that `openTreeFromAttackResult(...)` (§13.1) produced. The minted treeId stays stable across the reload; the URL fragment, the sessionStorage entry, and the in-memory `ConversationTree.id` all agree. This catches the reload of a minted-but-never-refreshed tree (operator opened a pre-V1.0 AR, browsed, never refreshed, reloaded). If sessionStorage also has no entry (operator typed `?conversation_tree_id=X` into the address bar without ever opening the tree, or sessionStorage was cleared), reconstruction fails soft to greenfield with a top-banner *"Tree `<X>` not found. Start a new tree, or open from History."* — the same fail-soft path as a typo'd id. + +#### 9.4.2 The `beforeunload` guard (V1.0) + +To protect unsaved structural edits (the only loss case under §9.4.1): + +```ts +window.addEventListener('beforeunload', (e) => { + if (hasUnrefreshedEdits(workspace)) { + e.preventDefault() + e.returnValue = '' // Browser shows "Leave site?" dialog + } +}) + +function hasUnrefreshedEdits(ws: Workspace): boolean { + const tree = ws.currentTree + if (!tree) return false + return tree.nodes.some(n => n.state === 'edited' || n.state === 'draft') +} +``` + +~5 LOC. Mandatory in V1.0, not optional polish — without it, the operator's "Cmd+R to recover from a janky render" reflex destroys mid-edit work. + +#### 9.4.3 Concurrent-tab advisory lock (V1.0) + +Two browser tabs viewing the same `conversation_tree_id` can race the runner — each tab independently fires up to `maxParallel=4` POSTs, blowing the cap to 8 in-flight. The fix is a `BroadcastChannel`-based advisory lock keyed on `conversation_tree_id`. + +**Correctness note (revision 10):** an earlier draft used `MessageChannel` reply ports transferred through `BroadcastChannel.postMessage` with a transfer-list argument.
That pattern fails at runtime — `BroadcastChannel.postMessage` only accepts a single message argument and does not support transferable objects (throws `DataCloneError` when passed a `MessagePort`). The correct pattern is request/reply correlation IDs on the same channel. + +```ts +const ch = new BroadcastChannel('pyrit-runner') +const heldLocks = new Set() // locks this tab holds +const tabId = uuid() // identifies this tab for diagnostics + +// Before a wave starts, try to acquire the lock for this tree: +async function acquireLock(treeId: ConversationTreeId): Promise<'acquired' | 'busy'> { + if (heldLocks.has(treeId)) return 'acquired' // already mine + const requestId = uuid() + const result = await new Promise<'busy' | 'acquired'>((resolve) => { + const handler = (e: MessageEvent) => { + if (e.data?.type === 'lock_busy' && e.data.requestId === requestId) { + ch.removeEventListener('message', handler) + clearTimeout(timer) + resolve('busy') + } + } + const timer = setTimeout(() => { + ch.removeEventListener('message', handler) + resolve('acquired') // no other tab responded; lock is ours + }, 50) + ch.addEventListener('message', handler) + ch.postMessage({ type: 'lock_request', treeId, requestId, tabId }) + }) + if (result === 'acquired') heldLocks.add(treeId) + return result +} + +// Respond to other tabs' lock requests when we hold the lock: +ch.addEventListener('message', (e) => { + if (e.data?.type === 'lock_request' && heldLocks.has(e.data.treeId)) { + ch.postMessage({ type: 'lock_busy', requestId: e.data.requestId, holderTabId: tabId }) + } +}) + +// On wave settle (success/failure/cancel): +function releaseLock(treeId: ConversationTreeId) { + heldLocks.delete(treeId) + ch.postMessage({ type: 'lock_released', treeId }) // wakes up any 'Wait'-polling tab +} +``` + +**Operator-visible behavior when a second tab tries to Refresh a tree another tab is mid-Refresh on:** + +> *"Another tab is refreshing this tree. 
[Refresh anyway] [Wait]"* + +`[Refresh anyway]` bypasses the lock (operator override; the only safe choice if the first tab crashed mid-wave); `[Wait]` listens for the `lock_released` message and auto-starts the new wave when it arrives. The wait state shows a spinner with *"Waiting for other tab to finish… [Cancel]"*. + +**Browser compatibility:** `BroadcastChannel` is supported in all modern browsers (Chrome, Firefox, Edge since launch; Safari 15.4+, March 2022). Operators on older Safari (≤15.3) see no cross-tab safety; the runner detects `typeof BroadcastChannel === 'undefined'` and skips the lock with a one-time console warning. Acceptable degradation: those operators get the V1.0 fork-bomb risk but the rest of V1.0 works. + +**Test scaffolding:** JSDOM does not implement `BroadcastChannel`. **V1.0 commits to polyfilling via the [`broadcast-channel`](https://www.npmjs.com/package/broadcast-channel) npm package (~5 KB)** loaded in the jest setup file (`frontend/src/setupTests.ts`); no per-test import needed because the polyfill registers as a global. Browser-mode test runners (Playwright, Vitest browser-mode) are not in the V1.0 stack — the polyfill keeps the test surface in jest-jsdom. The polyfill is dev-dependency only; production bundles use the browser's native `BroadcastChannel`. + +**Limitations:** + +- `BroadcastChannel` is advisory, not transactional. A crashed tab releases nothing; the operator override path handles this. +- Same-origin only. Cross-origin tabs (operator opens app in two different hostnames) can still race. Acceptable: operators rarely do this and the `RoundRobinTarget` ([round_robin_target.py:L15](../../../pyrit/prompt_target/round_robin_target.py#L15)) backend-side cap still provides a per-target backstop. +- ~50 ms acquire latency added to every wave start. Imperceptible relative to a typical 60-leaf refresh (10+ seconds). 
+ +**Why advisory and not strict (DB-backed):** strict locking requires a backend route to issue and release leases keyed on `conversation_tree_id`. The route doesn't exist. Adding it is a fair chunk of backend work for a problem that only surfaces when an operator opens the same tree in two tabs — uncommon enough that advisory + override modal is the right cost/benefit for V1.0. V1.1 can promote to a DB-backed lease if needed. + +#### 9.4.4 Hard backend & frontend type dependencies for V1.0 + +Three type-system changes ship in V1.0 to support the runner's dispatch and the auto-reverse reconstruction. All three are mechanical; the V1.0 GUI PR set carries them. + +**(a) Frontend `CreateAttackRequest` extension — adds `prepended_conversation`.** The current frontend type at [frontend/src/types/index.ts:158-163](../../../frontend/src/types/index.ts) has only `target_registry_name`, `name`, `labels`, `source_conversation_id`, `cutoff_index`. The backend supports `prepended_conversation: list[PrependedMessageRequest] | None` (max 200 messages, per [backend/models/attacks.py:L221-L243](../../../pyrit/backend/models/attacks.py#L221)). The runner's entire dispatch (per [03 §3.3](03_runner.md#33-dispatch-step-leaf-sendnode--partition--create_attack--sequential-add_message-calls)) sends `prepended_conversation` per leaf — this is the central hard dep. Also add the matching `PrependedMessageRequest` type (not currently in frontend types) and the `original_prompt_id` field on `MessagePieceRequest` (already present at [index.ts:L217](../../../frontend/src/types/index.ts#L217)). ~10 LOC frontend-only; no backend change for this item. 
+ +**(b) Backend DTO extension — extend `BackendMessagePiece` with `converter_identifiers` and `original_prompt_id`.** The two-field DTO extension carries the lineage data the runner needs: + +- Without `converter_identifiers` on the DTO, reload (§9.4.1) produces `UserTurnNode`s with empty converter pipelines — *indistinguishable from a turn that used no converter*. Operators have no way to see that the displayed tree is missing data. **Also load-bearing for `Fan(axis='converter')` reload:** [§9.3.1 variant-payload reconstruction](#931-fan-grouping-algorithm-v11--original_prompt_id-chain-flattening--wave_id-disambiguator) derives `variants[s].payload.converters` from each fan-child leaf's first user-turn `converter_identifiers`. Without the DTO ext, converter-fan reload silently corrupts every slot's converter list to `[]` and the next refresh fires without the operator's authored converters. +- Without `original_prompt_id` on the DTO, V1.0's `detect_fans_v10_plus` (§9.3.1) cannot read the lineage primitive it needs to wire `MessagePieceRequest.original_prompt_id` on prepended pieces (preserves lineage when the runner re-constructs ARs from cached pieces) and V1.1's `detect_fans_pre_v10` cannot run at all. + +**Sequencing:** the backend mapper PR ships **first** (before any V1.0 GUI PR). The change is small (~5 lines across `pyrit/backend/models/attacks.py`, `pyrit/backend/mappers/attack_mappers.py`, `frontend/src/types/index.ts`) and self-contained — adds two fields to a DTO; no behavior change. The V1.0 GUI PR set declares this as a build-time check (the auto-reverse code reads the fields; TypeScript fails if absent). + +**DTO field defaults** (explicit so reviewers don't infer): + +- `converter_identifiers: list[ComponentIdentifierField]` — default `[]` (empty list, not None). Pieces that never had a converter applied carry an empty list, distinguishable from "DTO missing the field" (which fails at the TypeScript boundary). 
The mapper copies directly from `piece.converter_identifiers`; the field is non-null on the domain side. +- `original_prompt_id: string` — default not applicable; per the [`_set_original_prompt_id_default` validator at message_piece.py:L182-L190](../../../pyrit/models/messages/message_piece.py#L182), persisted pieces *always* have a non-null `original_prompt_id` (the validator defaults it to `self.id` for fresh pieces). The DTO field is declared as `string` (not `string | null`) and the mapper copies directly; no defaulting needed in the mapper. + +#### 9.4.5 Hard backend dependency: relocate `_validate_operator_match` + +The V1.0 PR set carries the relocation only (Q.S.2 DECIDED V1.0: operator-as-tag, rev 18). Today's check has one problem the V1.0 PR closes; a second issue that earlier revisions wanted to "tighten" is now intentionally left as-is per the operator-as-tag framing. + +- **Today's check at [`attack_service.py:L693`](../../../pyrit/backend/services/attack_service.py#L693) reads from `piece.labels["operator"]`**, which is written by an `attack_mappers.py:L502` path that is `removed_in="0.16.0"`. After removal, the piece-label check silently no-ops; the server-side operator-isolation check disappears for tree-UI traffic, leaving only the UI posture. **This is the bug V1.0 closes.** +- **Today's check returns early when `request.labels` is absent or empty** (the `if not request.labels: return` at the top of the function). Earlier revisions proposed tightening this to reject anonymous requests against operator-owned ARs. **Rev 18 (per Q.S.2) keeps the early-return**: the operator tag is honor-system, not an auth claim, so anonymous requests pass through unchallenged. Tightening this would promote the tag to a claim, which V1.0 is not chartered to do; V1.1 multi-operator collaboration ([§13.8](#138-multi-operator-collaboration-v2)) revisits whether the tag should become a claim. + +**The V1.0 fix is single-part:** + +1. 
**Relocate** the source of the operator check from `piece.labels["operator"]` to `AttackResult.labels["operator"]` (resolved once per request via the AR id the conversation belongs to). Survives the 0.16.0 piece-label-write deprecation. + +The relocation is ~15 LOC plus tests. The V1.0 GUI PR set carries it because it's the only operator-lock-correctness story that survives 0.16.0; running V1.0 without the relocation leaves the server-side layer silently disabled and contradicts the §9.1 "visual lock + API lock" framing for the mismatched-tag case. + +**Sequencing enforcement.** The relocation PR targets `pyrit/backend/services/attack_service.py` and must merge **before** the V1.0 GUI PR. Two enforcement mechanisms ship together so the gate is not a manual coordination promise: + +1. **Backend version gate in the GUI.** The V1.0 GUI's startup health check ([App.tsx](../../../frontend/src/App.tsx) bootstrap) calls `GET /api/version` and parses a `min_compat` field; if `min_compat > installed_pyrit_version` (a constant baked into the GUI build), the GUI renders a maintenance banner: *"Tree view requires PyRIT 0.16.0+ with the updated operator-lock check. Detected: {version}. Update PyRIT to continue."* The backend PR bumps `min_compat` as part of its diff. Without the backend PR merged, the gate fires and the tree tab is unavailable — visible enforcement, not silent regression. +2. **PR review checklist.** The GUI PR's description carries three checkboxes: + - `[ ] Confirmed PyRIT backend PR # is merged and released as version >= 0.16.0`. + - `[ ] Confirmed [03 §11.2 labels round-trip test](../../doc/gui/design/03_runner.md#112-needs-the-backend-integration-tests) passes against the post-relocation backend.` This is the canary for the §4.3 labels-divergence invariant surviving the backend's `_resolve_labels` relocation; it fails loudly if the backend PR changed the existing-piece-label preference semantics under multi-piece `prepended_conversation`. 
+ - `[ ] Citation refresh pass complete.` Re-grep every `attack_service.py:L`, `attacks.py:L`, `attack_mappers.py:L`, and `message_piece.py:L` reference in the three design docs against the post-relocation backend, refresh any line numbers that drifted (±10 lines on long files per the rev-15 reviewer spot-check). One-time cleanup; future PRs are responsible for keeping their own diff-adjacent citations honest. + + Reviewers don't approve the GUI PR without all three boxes checked. Belt and suspenders; redundant with mechanism 1 (the startup version gate) but cheap. + +**PR sequencing enforcement.** The backend relocation PR ships **before** the GUI PR that enables the tree-UI flag. Sequence: + +1. **PR 1 (PyRIT core, backend):** relocate `_validate_operator_match` to read from `AttackResult.labels["operator"]`. Includes unit tests covering the relocation (existing-piece-label behavior preserved when the AR-level label is absent for backward compat). **Does NOT tighten the no-labels early-return** — anonymous requests continue to pass through unchallenged per the operator-as-tag framing (Q.S.2). +2. **PR 2 (PyRIT core, DTO):** the §9.4.4 (b) `BackendMessagePiece` extension (`converter_identifiers`, `original_prompt_id` exposed on the DTO). +3. **PR 3 (PyRIT GUI):** the V1.0 tree-UI behind the `enableTreeUI` feature flag, with frontend types pulling in the new DTO fields (PR 2) and labeling its requests with `operator` (defended by PR 1 against same-shape mismatches). + +**Enforcement mechanism, in priority order:** + +- *Build-time check (mandatory):* PR 3's frontend types reference `BackendMessagePiece.converter_identifiers` directly; TypeScript fails the build if PR 2 hasn't landed. This catches the DTO dependency at compile time. +- *Startup assertion (mandatory):* the tree-UI module includes a one-time startup probe that calls `GET /api/version` (or any read endpoint) and inspects the returned API version. 
If the version is below the one that includes PR 1's relocation, the tree-UI **disables itself with a banner** ("Tree UI requires PyRIT core ≥ X.Y.Z — current Z is older; falling back to chat tab. Update PyRIT core to enable."). This catches the operator-lock dependency at runtime, defending against operators who somehow run a mismatched GUI/backend pair (dev env, partial rollout). +- *PR description (advisory):* PR 3's description explicitly lists PR 1 and PR 2 as merge-before-this dependencies. Reviewers can use the link to verify both have shipped. + +The build-time check is sufficient for PR 2 (compile failure can't be ignored). The startup assertion is what defends against PR 1's silent-no-op failure mode (the backend would still accept requests; the GUI just wouldn't be safely deployable). Both must land in the V1.0 PR set, not as follow-ups. + +**One caveat for V1.0 design accounting:** under the V1.0 runner's AR-per-leaf model, every `add_message` targets an AR the runner *just created* with its own labels. The relocated check never rejects this — the AR's operator label matches the request's operator label by construction. So the server-side check fires correctly but rarely produces actual rejections under V1.0 runner traffic; its main value is defending against non-tree-UI clients (e.g., another GUI session, an API caller) that set their `operator` label *correctly* but reach for tree-UI-owned ARs under a mismatched tag. Anonymous callers (no `operator` label) are out of scope by design per Q.S.2 (operator-as-tag). See [§9.1 V1.0 isolation-posture clarification](#91-operator-isolation-posture) for the operator-facing implications. + +#### 9.4.6 Remaining limitations (post-revision-9, V1.0) + +After Patches #1 / #3 / §9.4.1-§9.4.5, only two limitations remain in V1.0: + +1. **One tree visible at a time.** Patch #1 ships single-tab Workspace (§13.1 V1.0 variant); the full tab strip is V1.1. 
Operators who want side-by-side use two browser tabs (with the §9.4.3 advisory lock handling cross-tab safety). +2. **Edits-since-last-Refresh are lost on reload.** The §9.4.2 `beforeunload` guard makes this hard to do accidentally; intentional reload (operator clicks "Reload from server" or types `?conversation_tree_id=...` in the address bar) discards them as expected. + +The earlier revisions' "reload destroys everything" framing is gone. + +**Soft caps (unchanged from previous revisions):** + +- Warn at **200 leaf `Send` nodes** in the conversation tree. +- Refuse adding a fan-out that would push leaf count over **1000** without an explicit operator override. +- Justification: react-flow render ceiling + the §7.5 storage cost. With AR-per-leaf the *piece* cost is lower than revision 2, but the *AR* count is the new bottleneck (1000 rows in history filtered by `conversation_tree_id` is still browsable, but visibly slow). + +**Soft-cap enforcement surface (V1.0).** The caps are checked at two points: + +1. **Mutation-time** (the operator action that would breach): `addNode` / `regenerateFanChildren` / `branchToNewTree` (and V1.1 `branchToSubtree`) all compute the post-action leaf count via a tree-walk before committing. The 200-leaf warning fires as a non-blocking toast (*"This tree now has 240 leaves; performance may degrade past 200."*). The 1000-leaf refusal fires as a confirm modal: *"This action would create 1080 leaves, past the 1000-leaf safety limit. [Cancel] [Override and proceed]"*. Override is operator-recorded in the `Workspace.settings.overrides_acknowledged: string[]` (per-session list of acknowledged-warning tree-ids). +2. **Render-time** (defensive): the canvas-level ribbon ([02 §2.3](02_tree_ui_affordances.md#23-canvas-level-affordances)) shows a persistent yellow badge on any tree with leaf count >200: *"200+ leaves; consider Branch from here to scope."* The render path does not refuse to render — just nudges. 
+ +The mutation-time check is the load-bearing one; render-time is defense against trees imported from History that already exceed the cap (e.g., a 1500-leaf historical attack auto-reversed). + +## 10. The "Tree" Tab - Linear + Graph in One Workspace + +Under §12.5, the tree view is a **new sibling tab** in the existing navigation ([App.tsx#L196-L230](../../../frontend/src/App.tsx#L196-L230)) named `'tree'`. The existing `'chat'` tab is unchanged. Inside the new tab there are **two coexisting views** of the same conversation tree: + +- **Graph view** - the react-flow tree from §8. Authoring surface for tree structure. +- **Linear view** - the existing `MessageList` + `ChatInputArea` from `ChatWindow.tsx`, rendered for the currently-selected leaf path. Selecting a leaf in the graph view sets this view's `activeAttackResultId` + `activeConversationId`. + +The two views are toggled (split-pane or tabbed switcher inside the tab - layout TBD). The intent is: graph view for structural reasoning ("which branches did I try?"); linear view for content reasoning ("what did the model say in branch X?"). + +### 10.1 The four existing chat actions map to tree-level operations + +| Existing button ([ChatWindow.tsx#L401-L475](../../../frontend/src/components/Chat/ChatWindow.tsx#L401-L475)) | ConversationTree-level equivalent inside the Tree tab | +|---|---| +| Copy to input | (no tree change - just populates the linear view's input box) | +| Copy to new conversation | Add a sibling `RootPromptNode` in the same conversation tree, seeded from the clicked message | +| Branch conversation | Add a sibling under an implicit `FanNode(axis='prompt')` at the clicked message's depth | +| Branch into new attack | `branchToNewTree(clickedMessageNode)` (new `conversation_tree_id`, new tab) | + +The existing `'chat'` tab continues to perform these as today (against `AttackResult`s with no `conversation_tree_id` label). The new `'tree'` tab promotes them to tree-level operations. 
+ +### 10.2 Follow-up: morph animation between views + +The user flagged this as a desirable enhancement (§12.5): when switching from graph to linear, **animate** the surviving chat elements (those on the selected leaf path) into the linear view's layout, fading out non-path nodes. The reverse animation expands the chat back into the tree. + +This is a polish item, not a V1 blocker. It is technically tractable with react-flow + a transition library (Framer Motion or `react-spring`) by sharing element ids between the two views; the underlying state (`tree` + `selectedLeafId`) is already unified, so the animation has the data it needs. + +## 11. Future Work: ConversationTree Persistence + +Revision 2 promoted a 1-day spike to a V1 precondition. Under §12.0 the spike is **deferred to V2**, and so are all the features it would unlock (multi-tab sync, undo/redo, conversation tree sharing, tree history). V1 ships with the client-only mitigations in §9.4 and accepts the limitations. + +The original spike specification is preserved here as the starting point for V2. + +### 11.1 The spike (for V2) + +**Hypothesis:** `AttackResult.metadata` is already a flexible `dict[str, Any]` and is already mutated by existing flows ([attack_service.py#L376-L378](../../../pyrit/backend/services/attack_service.py#L376-L378), [attack_service.py#L487-L492](../../../pyrit/backend/services/attack_service.py#L487-L492)). Serializing the conversation tree to `metadata['conversation_tree']` (or, more likely under AR-per-leaf, to a new `conversation_tree_definitions` table keyed by `conversation_tree_id`) requires only modest backend changes. + +**Why this is V2, not V1:** AR-per-leaf (§12.1) decouples conversation trees from individual `AttackResult` rows. The natural V2 storage shape is a `conversation_tree_definitions` table keyed by `conversation_tree_id`, joined to `AttackResult` via `labels.conversation_tree_id`. 
That's a new table and new endpoints - a fair chunk of backend work that V1 deliberately avoids. + +**V2 measurements** (when we get there): + +1. Serialized conversation tree size - target ≤100 KB for the 60-leaf reference tree. +2. Round-trip latency for conversation tree CRUD endpoints - target <50 ms p50. +3. Concurrent writers: two tabs editing the same `conversation_tree_id`. Pick a conflict policy (likely last-write-wins with a `plan_version` field). +4. Migration: how do operators with existing V1 client-only trees upgrade? Best answer: they re-import via the "Open as tree" action in §9.4 (which is robust because V1 already writes `conversation_tree_id` labels). + +### 11.2 What V1 deliberately omits to keep V2 clean + +- **No `conversation_tree_node_id` in `MessagePiece.prompt_metadata`** (see §7.3). V2 can introduce `plan_node_ref_v2 = {conversation_tree_id, node_id, plan_version}` without competing with V1 noise. +- **No new endpoints.** Every V1 operation maps to an existing route. V2 introduces `conversation_tree_definitions` resource without conflict. +- **No `update_attack_result.metadata['conversation_tree']` writes.** V1 doesn't touch `AttackResult.metadata` at all from the runner. V2 is free to claim the key. + +## 12. Decisions and Open Questions + +The decisions made by the user in this round are baked above. Reasoning summaries are kept here for traceability - future contributors should know *why* each choice was made. + +### 12.0 ConversationTree persistence: client-only for V1 - DECIDED (V1.0) + +Spike from revision 2 deferred to V2. V1 conversation tree lives in React state. Trade-offs accepted: no multi-tab sync, no undo/redo, no shareable conversation trees, conversation trees lost on reload. Mitigations in §9.4 (banner + "Open as tree" re-import path). + +*Author note:* I do NOT think otherwise. The spike was the right de-risking move if we were committing to writing `conversation_tree_node_id` into the backend. 
Once V1 omits that write (§7.3), the orphan-pointer concern that motivated the spike disappears, and the client-only V1 ships cleanly with no backend liability for V2 to clean up. The cost is operator UX (banner, re-import on reload) and that cost is acceptable for an MVP. + +### 12.1 AttackResult-per-leaf - DECIDED (V1.0) + +Every leaf `Send` path produces its own `AttackResult`. Trees are grouped via `labels.conversation_tree_id`. Matches today's `handleBranchAttack` semantics. Trade-offs accepted: 60 leaves → 60 history rows (filterable by `conversation_tree_id`); offset by simpler runner, fewer piece copies, and uniform leaf-level operator/target locking. + +### 12.2 Concurrency budget: `maxParallel=4` per-session (V1.0) / per-Workspace (V1.1) with fair-share - DECIDED + +V1.0 uses a global `maxParallel=4` cap (§6.3 rule 4) **scoped per browser session** (with only one tree in the session per §1 V1.0 exclusions, this collapses to a per-tree cap). **V1.1 promotes the scope to per-Workspace** when the tab strip lands and an operator may have M open conversation trees — the total in-flight POST count to the backend never exceeds the shared cap; tree A and tree B share one dispatch queue. The runner uses **fair-share scheduling**: when picking the next ready leaf, it prefers the tree whose active wave has the fewest in-flight calls. This prevents a 60-leaf refresh on tree A from starving a 3-leaf refresh on tree B. + +Operator-visible consequence (V1.1): "Refresh tree A → click Refresh on tree B → both run" interleaves fairly rather than running both at full speed. Tree B's wave will feel slower while tree A is mid-refresh; the wave-completion toast (§8.1 of 02) accurately reports each wave's own count regardless of interleaving. Worth a one-line acknowledgement in the wave UX if confusion arises; not a redesign. V1.0 does not see this interleaving (one tree per session). 
+ +**Why per-Workspace and not per-tree (V1.1).** The previous spec (per-tree budget) was correct when V1 was single-tree. §13 introduces Workspace with multiple open trees, and per-tree budgeting would let 10 open trees fire 40 simultaneous POSTs to the same target — day-1 rate-limit pain. Per-Workspace caps the worst case to the configured budget regardless of how many trees the operator has open. + +**Future consideration: per-target sub-budgets** (Option C from the decision review; V1.x). Per-target budgeting would let target A max out without affecting target B — most aggressive throughput-preserving behavior. Not on the immediate roadmap because (a) `RoundRobinTarget` ([round_robin_target.py:L15](../../../pyrit/prompt_target/round_robin_target.py#L15)) already handles cross-endpoint load distribution transparently below the runner, (b) operators who care can configure round-robin at the target layer today, and (c) per-target budgeting adds runner complexity (a budget *map* keyed by `target_registry_name` rather than a single number). Revisit if real operators hit cases where the shared budget bites and round-robin isn't enough. + +### 12.3 Layout: Buchheim-Walker via `d3-hierarchy` - DECIDED (see §8.2) — V1.0 (plain); main-path pinning V1.1 + +Revision 4 upgraded the original "custom DFS" recommendation to **Buchheim-Walker via `d3-hierarchy.tree()`** (~10 KB gzipped). The naïve DFS reserved `Σ width(children)` per parent and wasted horizontal space; Buchheim-Walker lets small subtrees nestle into large ones' gaps. Edge routing is orthogonal (`smoothstep`). Full reasoning in [02_tree_ui_affordances.md §4](02_tree_ui_affordances.md#4-layout); abbreviated rationale in §8.2. + +**V1.0 ships plain `d3-hierarchy.tree()`** (~10 KB + ~30 LOC). **Main-path pinning and adaptive stack-collapse-on-zoom land in V1.1** ([02 §4.3](02_tree_ui_affordances.md#43-recommendation-buchheimwalker--pinned-main-path--adaptive-collapse)). 
The V1.0 layout is determinate, tight (B-W's main property), and stable; pinning is a comfort feature for large trees, not a correctness one. + +### 12.4 No auto-scoring on Send - DECIDED (V1.0) + +There is no "default scorer runs on every message" concept in the GUI's `add_message` flow today (default scorers exist only inside `Scenario` orchestration at [scenario.py:L375-L410](../../../pyrit/scenario/core/scenario.py#L375-L410)). `ScoreNode` (§4.5) remains always explicit. Revisit when PyRIT introduces a default-scorer registry concept usable outside `Scenario`. + +### 12.5 Navigation: new sibling tab with dual view - DECIDED (V1.0; see §10) + +New `'tree'` tab in the sidebar (alongside `'chat'`, `'history'`, `'config'`). Inside the tab: graph view + linear view, toggleable. Existing `'chat'` tab unchanged. Follow-up: morph animation between graph and linear views (§10.2), polish-only. + +### 12.6 Migration: auto-reverse linear conversations to a tree - DECIDED (see §9.3) — V1.0 (linear+converter); V1.1 (fanout detection) + +Default behavior when opening an existing AR in the tree tab: synthesize `UserTurn` + `Send` pairs from each message, hydrate converter pipelines from `MessagePiece.converter_identifiers`. **V1.1 adds:** lift multi-conversation attacks into implicit `FanNode(axis='prompt')` branches at `original_prompt_id` divergence points. `ImportMessageNode` remains in the kind set for operators who want the fast read-only path. The V1.0 piece carries a soft DTO dependency on extending `BackendMessagePiece` with `converter_identifiers` (and pre-emptively `original_prompt_id` for V1.1) — documented in §9.3. + +### 12.7 Renderer: react-flow chosen, with the door open - DECIDED (V1.0; see §8.1) + +Per the §8.1 comparison table: ~45 KB gzipped is acceptable, custom node components are first-class, pan/zoom/keyboard nav are built-in (even if a11y needs reinforcement - §8.4). 
The `conversationTreeToReactFlow` adapter (§8.3) confines react-flow's API surface to one module, so swapping renderers later is one PR. Rolling our own would save ~40 KB at the cost of weeks of polish work - not worth it for V1. + +The a11y keyboard layer in §8.4 ships in V1 (the existing app is keyboard-accessible end-to-end and we cannot regress that). + +### 12.8 Cancellation: UI-level V1.0, backend-token V1.x - DECIDED + +**V1.0 ships a UI-level Cancel button** ([03 §9](03_runner.md#9-cancellation)): the wave-status banner shows `[Cancel]` during an in-flight wave; clicking flips a per-wave flag the runner checks at each `ready.popNext()` boundary. Already-dispatched leaf sequences complete (their `add_message` calls run to completion); undispatched leaves transition `running → cancelled`. The wave-complete toast reports counts of cancelled leaves alongside succeeded/failed. + +**V1.x adds backend-token cancellation** that aborts in-flight HTTP calls too. The backend `create_attack`/`add_message` routes have no cancellation token today; adding one is a small cross-cutting change. The V1.0 cancel-at-boundary covers the dominant operator cost (a 600-call refresh saves potentially hundreds of unstarted calls; only the in-flight 4 still complete). V1.x makes the cancel fully synchronous. + +### Genuinely-open questions + +- **Q.A:** Should the `conversation_tree_id` label be exposed in the existing `'chat'` tab's history view as a filter chip in V1, or wait for the new `'tree'` tab to ship first? *Author lean: ship the filter chip in V1 - it's a 1-line addition to the existing `HistoryFilters` type, and immediately useful even before the tree tab lands.* +- **Q.B:** When the operator deletes a conversation tree node that has executed leaves, what happens to the underlying `AttackResult`s? *Author lean: leave them in the backend (append-only model); the conversation tree deletion just orphans them from the tree view. 
They remain queryable in the history tab via their `conversation_tree_id`. Hard-deleting backend rows is out of scope.* + +## 13. Workspace and Worktrees - the data model + +> **Version scope (revision 9).** **V1.0 ships a minimal Workspace data model** — `{ currentTree: ConversationTree | null; recentTreeIds: ConversationTreeId[] }` — which holds exactly one foregrounded tree plus a small list of recent tree IDs for the "Switch tree" affordance ([02 §2.3](02_tree_ui_affordances.md#23-canvas-level-affordances)). The full **tab strip with `conversationTrees[]` and concurrent-tree dispatch is V1.1** — the V1.0 cut keeps the operator's mental model simple (one tree visible, switch via the ribbon) and unlocks `branchToNewTree` (§6.5) without paying for the tab-strip UI surface. +> +> **Why this revision flipped:** the previous revision deferred all of §13 to V1.1. That cascaded into deferring `branchToNewTree` because "the always-new-tab variant has nowhere to land in V1.0," which left V1.0 operators with no in-tree "preserve the original" affordance — they had to context-switch to the chat tab. The minimal Workspace (~30 LOC) shippable in V1.0 keeps `branchToNewTree` and only defers the tab strip (a UI surface, not a data model). +> +> **What V1.0 ships vs. 
what V1.1 adds:** +> +> | Concern | V1.0 (minimal Workspace) | V1.1 (full tab strip) | +> |---|---|---| +> | Active trees in React state | one (`currentTree`) | many (`conversationTrees[]`) | +> | Switching trees | "Switch tree" button → chooser popover over recent IDs | tab strip | +> | `branchToNewTree(node)` (V1.0/V1.1) | swap `currentTree` to clone; source re-openable from History | new tab in strip; source stays foregrounded if operator prefers | +> | `branchToSubtree(node)` (V1.1) | n/a — not in V1.0 | sibling subtree in same canvas (dashed edge style) | +> | Side-by-side comparison of two trees | two browser tabs + §9.4.3 advisory lock | tab strip + split-pane (V1.1+) | +> | Concurrency budget (§12.2) | per-session = per-tree (one tree visible) | per-Workspace fair-share | +> | Reload reconstruction (§9.4.1) | restores the URL-fragment tree | restores all tabs from `sessionStorage`-cached tab strip | +> +> The data model below describes the V1.1 full shape; the V1.0 variant is the same shape with `conversationTrees.length ≤ 1` at all times and the tab strip UI gated off. + +The git mental model in §6.8 covers the lifecycle vocabulary (commit, reflog, rebase, cherry-pick). This section covers the *data model* framing the user raised in revision 5: **each ConversationTree is a worktree, and the Workspace is the repository root.** The framing tightens the analogy — "tree as branch" was loose because trees have many tips; "tree as worktree" fits perfectly because worktrees have one HEAD per checkout and a DAG of reachable commits below it, which is exactly our shape. 
+ +### 13.1 V1.0 minimal Workspace + +```ts +export interface Workspace { // V1.0 shape + currentTree: ConversationTree | null // the foregrounded tree; null = greenfield + recentTreeIds: ConversationTreeId[] // last ~10 tree IDs visited (persisted to sessionStorage) + settings: WorkspaceSettings // operator-tunable; loaded from sessionStorage with defaults +} + +export interface WorkspaceSettings { + reflogCapPerNode: number // default 50; hard max 200 (per §6.6) + confirmThresholdCount: number // default 20 (per [02 §8.1](02_tree_ui_affordances.md#81-the-v1-chain-preview-banner--confirm-modal--toast--drawer-panel)) + suppressConfirmModalThisSession: boolean // operator toggled "Don't ask again" (default false) +} +``` + +**`recentTreeIds` is persisted to `sessionStorage`** (~one JSON entry, key `pyrit.workspace.recentTreeIds`). The list survives accidental browser refreshes within a session; it does NOT survive closing the tab (which is correct — a fresh session starts empty, matching operators' "new exploration" expectation). The URL fragment `?conversation_tree_id=X` is the canonical source for *which* tree to restore on reload (§9.4.1); `recentTreeIds` is just the MRU list for the Switch-tree popover. + +**Settings persist similarly.** `WorkspaceSettings` is loaded from `sessionStorage` at boot with hard-coded defaults as fallback. Operator changes via a settings popover (canvas-level ribbon) write back immediately. + +**Schema versioning (V1.0 → V1.1+) — drop-on-mismatch.** All `pyrit.*` sessionStorage keys (`pyrit.workspace.recentTreeIds`, `pyrit.workspace.settings`, `pyrit.workspace.parentSourceConversationId.{treeId}` per [§13.1 `openTreeFromAttackResult`](#131-v10-minimal-workspace), and the `pyrit.workspace.conversation_tree_id` URL-fragment fallback) are namespaced under a single version key: `pyrit.schemaVersion = '1'` for V1.0. 
On boot (step 0 of the [§9.4.1 reload-reconstruction sequence](#941-reload-reconstruction-v10)), the runner reads `pyrit.schemaVersion` first; if it is absent OR not equal to the current version, the runner wipes every key matching `pyrit.*` via `Object.keys(sessionStorage).filter(k => k.startsWith('pyrit.')).forEach(k => sessionStorage.removeItem(k))`, writes the current version, and surfaces a one-line toast: *"Saved settings were from a different version and have been reset."* The reload then proceeds with the keys absent (greenfield-equivalent for each), exactly the same fail-soft path the [§9.4.1 pre-V1.0 fallback](#941-reload-reconstruction-v10) already documents for a missing `pyrit.workspace.parentSourceConversationId.{treeId}`. + +Why global + drop, not per-key migration: (a) sessionStorage is tab-scoped and wipes on tab close anyway, so the wiped data was already short-lived; (b) every wiped key is recoverable (settings revert to defaults; MRU rebuilds as the operator opens trees; `parentSourceConversationId.*` is only needed for the §9.4.1 reload of minted-but-never-refreshed trees, which already fails-soft to greenfield); (c) one version constant to bump per release that changes any persisted shape, no per-key migration code to maintain or test for partial-migration states. **Operator-visible cost of a V1.0 → V1.1 bump:** one toast, an empty MRU, default settings, and any minted-but-never-refreshed pre-V1.0 AR session is lost (operator re-opens from History). Acknowledged in [§1.2 V1.0 known limitations](#12-v10-known-limitations-sharp-edges-in-what-v10-does-ship). + +**Operations (V1.0):** + +- `openTree(treeId)` — if `hasUnrefreshedEdits(workspace)` returns true, show the dirty-edit modal (§13.1a) first. Then: load via auto-reverse (§9.3) from `GET /api/attacks?labels.conversation_tree_id=treeId`; set as `currentTree`; push prior tree's id onto `recentTreeIds` (capped at 10, FIFO). 
+- `openTreeFromAttackResult(attackResultId)` — the History tab's "Open as tree" affordance ([02 §5.12](02_tree_ui_affordances.md#512-open-a-historical-attack-auto-reverse)). Same dirty-edit guard. Inspects the source AR's `labels.conversation_tree_id`: + - **If present** (V1.0+ AR): delegates to `openTree(treeId)` with the labelled id. + - **If absent** (pre-V1.0 AR with no `conversation_tree_id` label): mints a fresh `ConversationTreeId` via `crypto.randomUUID()`, hydrates the in-memory tree from `GET /api/attacks/{attackResultId}/messages?conversation_id=ar.conversation_id` via the linear-chain reconstruction path (§9.3), sets `ConversationTree.parentSourceConversationId = ar.conversation_id` so reload can locate the legacy source, and sets as `currentTree`. The URL fragment immediately writes `?conversation_tree_id={treeId}` (the freshly minted id); sessionStorage writes `pyrit.workspace.parentSourceConversationId.{treeId} = ar.conversation_id` so the §9.4.1 reload fallback can find the legacy AR. **Until the first Refresh, no backend write has happened** — the minted id is operator-local; the first Refresh fires `create_attack + N add_message` with the minted id in `labels.conversation_tree_id`, and the resulting per-leaf AR rows in History are the first persisted references to the tree. +- `newTree()` — same dirty-edit guard. Create empty `ConversationTree`; set as `currentTree`. +- `closeTree()` — same dirty-edit guard. Set `currentTree = null` (returns to greenfield). The closed tree's id stays in `recentTreeIds` for re-opening. +- `branchToNewTree(node)` — **exempt from the dirty-edit guard** (rev 11). The clone is created via deep-copy (§6.5), so the source's `edited` `params` and `edited` `state` are carried into the clone; nothing is lost in-session. 
**The source's `undoStack` is also deep-copied into the clone** (rev 16, per reviewer Finding 4) so the operator can still Ctrl-Z the carried `edited` state inside the clone — without this, an accidental `📋` click would permanently lock in every structural edit the operator made before clicking, since the source's `undoStack` is itself cleared on tree-swap. Set the clone as `currentTree`; push source's id onto `recentTreeIds`. *Caveat:* the SOURCE tree, if re-opened later via Switch tree or History, will reflect the last refreshed state — unsaved source-tree edits live only inside the clone after branching. Operators discarding the clone (close, then never re-open) effectively discard those edits. Documented in the toast text ("Branched from {sourceDisplayName}. Source tree's unsaved edits AND undo history are carried into this clone; source resets if you re-open it later."). `branchToSubtree(node)` (V1.1) is similarly exempt because the cloned slice lives in the same canvas — no swap, nothing is lost. + +### 13.1a Dirty-edit guard on tree swap (V1.0) + +The §9.4.2 `beforeunload` guard catches reload/tab-close but NOT in-app tree swaps (`openTree`, `newTree`, `closeTree`). Without an in-app guard, an operator with 3 edited `UserTurnNode`s in tree A who clicks **"Switch tree"** to load a recent one loses those edits silently — the swap is a pure React state mutation, no browser event fires. (`branchToNewTree` is exempt per the §13.1 operations spec — the clone deep-copies the source's `edited` state, so nothing is lost; the source's unsaved edits live inside the clone after branching.) 
+ +```ts +function hasUnrefreshedEdits(ws: Workspace): boolean { + const tree = ws.currentTree + if (!tree) return false + return tree.nodes.some(n => n.state === 'edited' || n.state === 'draft') +} + +async function guardedSwap(ws: Workspace, swap: () => void): Promise<void> { + if (hasUnrefreshedEdits(ws)) { + const confirmed = await showModal({ + title: `Unsaved edits in "${ws.currentTree!.displayName}"`, + body: `You have ${countUnrefreshed(ws)} unsaved edits that will be lost when switching trees. Refresh the tree first to persist them as AttackResults, or continue to discard.`, + buttons: [ + { label: 'Cancel', value: false, default: true }, + { label: 'Discard and continue', value: true, destructive: true }, + ], + }) + if (!confirmed) return + } + swap() +} +``` + +~15 LOC plus the modal component (which already exists for the cost-guardrail). **Four of the five `Workspace`-mutating operations** (`openTree`, `openTreeFromAttackResult`, `newTree`, `closeTree`) funnel through `guardedSwap`. **`branchToNewTree` bypasses the guard** per the §13.1 exemption — the clone deep-copies the source's `edited` state, so nothing is lost in-session (the source's unsaved edits live inside the clone after branching). V1.1 `branchToSubtree` is also exempt (the cloned slice lands in the same canvas — no swap, nothing is lost). The dirty-edit predicate is the same one §9.4.2 uses. + +**Why not auto-save the unrefreshed edits.** V1.0 has no server-side tree persistence; the only place to "save" structural edits is to fire them as Refreshes, which costs tokens. Asking the operator before discarding is the right tradeoff — they can `Cancel` and click `Refresh tree` first to persist, then come back to swap. + +**UI surface (V1.0):** + +- Canvas-level ribbon ([02 §2.3](02_tree_ui_affordances.md#23-canvas-level-affordances)) has a **"Switch tree"** button. Clicking opens a popover listing `recentTreeIds` (each rendered with the source tree's display name); selecting one calls `openTree(id)`. 
+- The ribbon also surfaces `currentTree.conversation_tree_id` as a chip with a "Copy" affordance — operators can paste the id into the URL of a second browser tab for the §9.4.3 multi-tab workflow. +- A **settings popover** in the ribbon exposes `reflogCapPerNode` and `confirmThresholdCount` for operator tuning. +- No tab strip in V1.0. + +**Operator-visible quirks (acceptable for V1.0):** + +- The clone-via-`branchToNewTree` swaps the canvas without animation; the operator sees their tree replaced by the clone. The toast (*"Branched from {source tree name}. Source tree's unsaved edits AND undo history are carried into this clone; source resets to last refreshed state if you re-open it later."*) sets the expectation. Operators who want side-by-side use two browser tabs. +- Closing the current tree clears the canvas; the operator can re-open from "Switch tree" or History tab. The §13.1a guard catches lost-edits cases. +- **V1.0 → V1.1 affordance migration cost:** the `📋` button's V1.0 behavior (swap the canvas) differs from V1.1's (open a new tab in the strip). Operators who learn V1.0's muscle memory will need to re-acquaint once V1.1 ships. One-time cost; documented in the V1.1 release notes when that ships. + +### 13.2 V1.1 conceptual mapping (tab strip) + +``` +git | CoPyRIT tree view +----------------------------+-------------------------------------------------- +Repository (object store) | Backend `AttackResult` + `MessagePiece` rows + | (append-only, shared across all worktrees; + | filtered by `labels.conversation_tree_id` in History) +Worktree | One ConversationTree (one tree-view canvas instance) +HEAD per worktree | Per-leaf `execution: ExecutionRecord` on each Send +Branch ref (.git/refs/...)
| A node's `execution` field; the per-node "tip" +Working directory | The mutable tree node params (text, attachments) +Index / staging area | (none — edits are immediate; no staging concept) +Reflog | `executionHistory: ReflogEntry[]` per node (§4.6 wraps each ExecutionRecord with a per-tree `pinned` flag) +`git worktree add` | `branchToNewTree(tree.root)` (UI label "Clone tree") — lifted + | into a new ConversationTree in the Workspace's conversationTrees[] list +`git worktree list` | The tab strip in the 'tree' view (one tab per ConversationTree) +`git worktree remove` | Close-tree affordance: drops the ConversationTree from + | React state; backend rows persist +``` + +### 13.3 The Workspace type (V1.1 full shape) + +The V1.1 React state container holds many trees plus an active-tab pointer: + +```ts +export interface Workspace { // V1.1 shape + conversationTrees: ConversationTree[] // each ConversationTree has its own conversation_tree_id (its worktree id) + activeConversationTreeId: ConversationTreeId // which tree tab is foregrounded + /** + * Optional cross-worktree state. V1.1 has none — every conversation tree is independent. + * V2 may track "this conversation tree is a clone of that conversation tree" via parent_conversation_tree_id labels + * (already written to AttackResult labels per Q.A.1 resolution). + */ +} + +export interface ConversationTree { + id: ConversationTreeId // === conversation_tree_id; one stable UUID per ConversationTree + nodes: ConversationTreeNode[] + edges: ConversationTreeEdge[] + rootId: ConversationTreeNodeId + displayName: string // operator-editable; defaults to root prompt's first 40 chars + createdAt: string + /** + * Set at clone time by `branchToNewTree` (§6.5); the source tree's id. `null` for trees + * created via `newTree()` or restored from History without a parent context. 
The runner's + * `_build_labels` helper ([03 §3.3a](03_runner.md#33a-helpers-referenced-by-the-dispatch-step)) + * reads this field and writes `labels.parent_conversation_tree_id` on every leaf AR of a + * cloned tree, so History "where did I fork this from" navigation works without server-side + * state. Once set, never modified; clones-of-clones overwrite (the most-recent parent wins). + */ + parentConversationTreeId: ConversationTreeId | null + /** + * Set at Open-as-tree time by `openTreeFromAttackResult` (§13.1) when the source AR is + * pre-V1.0 (no `conversation_tree_id` label). Carries the source AR's `conversation_id` + * so [§9.4.1 reload-reconstruction](#941-reload-reconstruction-v10) can locate the legacy + * AR via the fallback path when the labels-query returns no rows. Mirrored into + * sessionStorage at `pyrit.workspace.parentSourceConversationId.` for the + * reload-fallback lookup. Once the first Refresh has fired, the labels-query returns + * rows and the field becomes redundant for reload purposes, but it is kept for History + * navigation (operator can see "this tree was reconstructed from AR "). + * `null` for trees created via `newTree()`, `branchToNewTree()`, or `openTree()` on a + * V1.0+ AR with a real `conversation_tree_id` label. + */ + parentSourceConversationId: string | null + /** + * In-memory inverse-op stack for Ctrl-Z structural undo per [§6.9](#69-node-editor-undo-v10). + * Cap N=20, FIFO eviction. Cleared on tree-swap (openTree/newTree/closeTree). **Carried + * into the clone by `branchToNewTree`** alongside the source's edited state, so the + * operator can Ctrl-Z carried edits inside the clone (rev 16 / reviewer Finding 4). + * NOT persisted to sessionStorage — reload loses it, same contract as edits-since-last-Refresh. + * V1.x may add a parallel redoStack; the field name stays `undoStack` to keep the V1.0 + * → V1.x migration a pure addition. 
+ */ + undoStack: UndoOp[] +} +``` + +**V1.0 → V1.1 migration cost:** the V1.0 `Workspace` is a strict subset (`conversationTrees = currentTree ? [currentTree] : []`; `activeConversationTreeId = currentTree?.id`). V1.1 promotes the field and adds the tab strip UI; no data migration. The runner, layout engine, propagation logic, and render pipeline all operate on `ConversationTree`, not `Workspace` — so the change is contained to the React state container and the tab strip UI. + +### 13.4 What's mutable, what's append-only + +This is the question revision 5 raised: do we keep all history edits, or allow mutable tree structure with append-only executions? + +**V1 answer: hybrid (Model A below).** ConversationTree structure is mutable; ExecutionRecords are append-only. + +| Concern | What's preserved | What's mutable | +|---|---|---| +| **`ExecutionRecord`** (runs) | Append-only in backend; per-node `executionHistory` (capped at `REFLOG_CAP_PER_NODE`, default 50, configurable per-Workspace — §6.6) keeps the local reflog | — | +| **ConversationTree node params** (text, attachments, converter pipeline, target) | The *currently-displayed* params; old values not tracked | Yes — operator edits replace prior values | +| **ConversationTree structure** (which nodes exist, where they sit in the tree) | The *current* structure; deletions are permanent | Yes — delete a fan, delete a UserTurn, etc.
| +| **Workspace** (which conversation trees are open) | Current set; closing a ConversationTree discards its in-memory React state | Yes — operator opens/closes/clones conversation trees | +| **Cross-ConversationTree references** | `labels.parent_conversation_tree_id` on cloned AttackResults; persists in backend; surfaces in History | (not mutable; set at clone time) | + +**Three model options considered (A chosen; B and C rejected for V1):** + +| Model | Idea | Reject reason | +|---|---|---| +| **A: Status quo (this is V1)** | Mutable conversation tree + append-only executions; clone is the answer for preservation | **Chosen** | +| **B: Full version control on conversation trees** | Every edit to a conversation tree node creates a new version; conversation tree itself is append-only (CRDT-like) | Substantial complexity for a problem operators may not have. Undo/redo via a simple React-state stack (V1.x) is the 10% solution. | +| **C: Mutable conversation tree + explicit `frozen: boolean` per node** | Operator marks a node as immutable; propagation stops at frozen nodes | Adds a new propagation rule (stop-at-frozen), complicates edited/stale logic, and risks the operator forgetting which nodes are frozen. **Branching (§6.5) already provides preservation without per-node ceremony.** Revisit if real operators report needing fine-grained freeze. | + +The "clone is the answer" pattern keeps the propagation rules simple (every edit cascades to every clean descendant; no frozen carve-outs) and matches git's actual workflow (preserve a branch by creating a worktree, not by marking files read-only). + +### 13.5 Worktree operations — what changes from revisions 1-4 + +Three operations sharpen under the worktree framing. Everything else is unchanged.
+ +**Branching is the worktree operation.** Two API functions cover the concept (§6.5): `branchToNewTree(nodeId)` (V1.0) for "clone the whole tree" (clicking the root) or "branch from this specific node into a new tree" (clicking any other node); `branchToSubtree(nodeId)` (V1.1) lands the cloned slice as a sibling within the same canvas. V1.0 ships only the new-tree variant — clicking `📋` swaps the Workspace `currentTree` to the clone; the source is re-openable from History. Revisions 4-6 had only the sibling-subtree mode; revision 7 dropped it; revision 8 reintroduced it for V1.1 with disambiguated edge rendering; revision 9 brought the always-new-tree variant forward to V1.0; revision 14 split the two landing modes into separate API functions to force explicit call-site choice. + +``` +Before clone: After clone (Workspace view): + Workspace Workspace + └─ ConversationTree A (tab active) ├─ ConversationTree A (tab, no longer active) + └─ tree with #4 promoted │ └─ tree with #4 promoted + └─ ConversationTree B (tab, active) + └─ same tree shape; #4 still promoted + (operator can now promote #7 instead) +``` + +The clone is structurally identical to the source until the operator diverges either side. Backend ExecutionRecords are shared (no duplication); the two conversation trees both reference the same AR ids until re-execution. + +**Open historical attack.** Previously: opens in the existing canvas (auto-reverse per §9.3). **Now: opens as a new ConversationTree tab.** Multiple historical attacks can be open simultaneously as separate worktrees. + +**Tab strip in the 'tree' view.** Each ConversationTree is a tab. Tab close = `git worktree remove` (ConversationTree drops from React state; backend rows persist; can be re-opened via History → "Open as tree"). Tab reorder = drag-and-drop, purely visual. Tab rename = inline edit on the ConversationTree's `displayName`. 
+ +### 13.6 What this does NOT change + +To keep the revision tight, here is what the worktree framing **does not** introduce: + +- **No backend changes.** Workspace is purely a React-state container. Each ConversationTree still writes `labels.conversation_tree_id` on its own ARs (per §12.1 of revision 3). The History view groups by `conversation_tree_id` as before. +- **No new endpoints.** Same set as §7. +- **No `frozen` field.** Rejected above; revisit only if real operators ask. +- **No conversation tree version log.** Rejected above; undo/redo via React state stack is V1.x at most. +- **No cross-tree operations** (merge, fast-forward, rebase-onto-other-conversation-tree). These would be V2+ territory and would require the merge primitive that V1 explicitly excludes. +- **No mobile / narrow-viewport story** (Q.A.5 from revision 4 is still deferred — see [02_tree_ui_affordances.md §8](02_tree_ui_affordances.md#8-long-term-vision-navigable-whiteboard-canvas)). + +### 13.7 Worked example: pursuing two attempt picks in parallel + +The user's revision 5 scenario: "I want to explore both attempt #4 and attempt #7 from the same 10-attempt fan." + +**Old answer (revisions 4-6):** Snapshot the root inside ConversationTree A → two sibling subtrees in the same canvas → set `promotedChildSlotIndex` differently in each. (Revision 7 dropped this mode; revision 8 reintroduces it for V1.1 with disambiguated edge rendering, see §6.5 "Two landing modes".) + +**V1.0 answer (via §6.5 + minimal Workspace §13.1):** `branchToNewTree(treeA.root)` swaps the canvas to ConversationTree B (source A goes to History) → set `promotedChildSlotIndex=7` in B's root fan. Operator uses "Switch tree" or a second browser tab (with the §9.4.3 advisory lock) to flip back to A and compare; ExecutionRecords are shared between A and B until divergence. 
+ +**V1.1 answer (full tab strip):** `branchToNewTree(treeA.root)` opens ConversationTree B as a new tab → set `promotedChildSlotIndex=7` in B's root fan while A keeps `promotedChildSlotIndex=4`. Operator flips between the two tabs in the strip; no swap. ExecutionRecords are shared between A and B until divergence. + +The V1.1 answer is cognitively cleaner because the tab strip makes the "I have N parallel hypotheses live" state visible at a glance; V1.0 trades that for the "Switch tree" chooser, which is a discoverable-enough fallback for the first release. + +### 13.8 V2 directions (not committing yet) + +When V2 lands server-side conversation tree persistence (§11), the worktree framing extends naturally: + +- **Persist the Workspace**, not just one ConversationTree. Operators can `git pull` their workspace from any browser. +- **Share conversation trees across operators** via `labels.conversation_tree_id` indirection — equivalent to `git push`/`git fetch` of a worktree. Concurrency model: last-write-wins with `plan_version`. +- **Cross-ConversationTree refresh** (V2.1+): "refresh ConversationTree B's root prompt against ConversationTree A's current root prompt" — useful for "apply this change across all my experiments". Conceptually a cross-tree rebase. Requires careful UX to make sure the operator can preview before committing. +- **ConversationTree history / reflog at the ConversationTree level**, not just per-node: every Workspace mutation (addConversationTree, closeConversationTree, structural edits) becomes a log entry. True undo/redo. CRDT-style merge if multi-operator editing lands. + +None of this is V1. V1.0 is: Workspace = `{ currentTree; recentTreeIds }`, ribbon Switch-tree affordance, clone swaps the canvas. V1.1 is: Workspace = `{ conversationTrees: ConversationTree[]; activeConversationTreeId }`, tab strip in the 'tree' view, clone creates a new tab. + +## 14. 
Refresh Waves - grouping per-node executions into a user-intent unit + +Revision 6 promoted worktrees to V1 (§13). Once an operator has multiple worktrees and large fan-outs, a single click of "Refresh tree" produces dozens of new `ExecutionRecord`s across many leaves. Without grouping, those records are an unsorted soup. Git solves this implicitly — `git log` shows a rebase as a contiguous range of new commits because they share authorship/timestamp metadata. We solve it explicitly with a `waveId`. + +### 14.1 The data model addition + +```ts +export interface ExecutionRecord { + // ... existing fields ... + + /** + * Identifier of the refresh wave that produced this ExecutionRecord. All + * ExecutionRecords created by one `refreshSubtree` / `refreshTree` / + * `refreshNode` call share the same `waveId`. A single isolated refresh + * (one node, one execution) still gets a waveId so wave-grouped views can + * treat it uniformly. + * + * Null only for the very first synthetic ExecutionRecord created at + * auto-reverse time (§9.3) where the refresh concept does not apply. + */ + waveId: string | null + + /** + * Snapshot of when the wave started (not when this individual execution + * completed). For a wave of 60 leaves, all 60 ExecutionRecords share + * `waveStartedAt`; their individual `attemptedAt` timestamps differ. + * Used to sort waves by recency in the workspace timeline. + */ + waveStartedAt: string | null + + /** + * The *kind* of operator action that triggered the wave. String enum, not a + * node ID — we deliberately avoid stamping a `ConversationTreeNodeId` here + * because those IDs are client-only (§12.0) and become orphan pointers after + * reload (the same leak §7.3 explicitly disavows for piece metadata). + * + * Operators get the in-memory `ConversationTreeNode` reference for free in + * the live UI (toast "View wave", Recent waves drawer) because the wave was + * just created. 
After reload, the trigger node is gone with the rest of the + * tree; the *kind* survives and is what operators filter History on. + */ + waveTriggerKind: + | 'refresh_node' // V1.0 — absorbs `initial_send` (first auto-Send) and `fan_expand` (single-variant refresh) + | 'refresh_subtree' // V1.0 — absorbs `fan_axis_change` (regenerates fan children) and `rerun_multiple` (↻×N attempt-fan children) + | 'refresh_tree' // V1.0 — absorbs `branch_rebase` (operator's first refresh of a cloned tree) + | 'retry_failed' // V1.0 — operator clicks Retry-failed in the wave-complete toast; preserves "this wave was a retry" audit signal vs. a fresh action + | 'synced_peer_add' // V1.1 — Stack-`+` adds a synced peer set, runner refreshes all peers + | 'cross_tree_rebase' // V2.1+ — cross-tree refresh (conceptually a rebase across worktrees); wire-level name preserved per [02 §3.5 git mental model](02_tree_ui_affordances.md#35-git-mental-model) +} +``` + +**Why this enum is small.** Revision 15 (per reviewer Finding 1) collapsed an earlier 11-value enum down to four V1.0 values. The dropped values — `initial_send`, `fan_expand`, `fan_axis_change`, `branch_rebase`, `rerun_multiple` — each collapsed into one of the three core verbs (`refresh_node`, `refresh_subtree`, `refresh_tree`) based on which runner entry point the UI action actually invokes; the inline comments above name the mapping. The audit-side trade-off: the History tab cannot filter "first send vs. operator-rebased clone vs. fan-axis change" — they all read as one of the three verbs. What's kept: which runner entry point fired, plus whether this wave was a retry (the only audit signal that doesn't derive from the call site). Revisit if real-operator audit requests surface a distinction we collapsed. + +**Note:** there is intentionally no `'make_current'` variant. `makeCurrent` is a pure pointer swap — no ExecutionRecord, no wave. 
The subsequent (operator-chosen) refresh of the now-stale descendants is the wave-generating event, and it carries the refresh action's own kind (`refresh_subtree`). + +And one corresponding addition to the AR label set: + +```python +# In the runner, before each POST /attacks: +ar_labels["wave_id"] = wave_id # UUID v4, set once per refresh call +ar_labels["wave_started_at"] = iso_timestamp +ar_labels["wave_trigger_kind"] = trigger_kind # string enum; never a UUID +``` + +`wave_id` joins `conversation_tree_id` and the existing operator/operation labels on every AR. No backend schema change — `labels` is already `dict[str, str]` per [attacks.py](../../../pyrit/backend/models/attacks.py). + +### 14.2 What this enables + +| View | Where it lives | Backed by | +|---|---|---| +| **"View wave" toast** after refresh | Bottom-right toast (V1) | In-memory `waveId` of just-completed wave | +| **Recent waves panel** inside a ConversationTree | Drawer tab next to "Past runs" (V1) | Per-ConversationTree list of distinct `waveId`s, newest first | +| **Per-node reflog popover** with wave grouping | Node `⟲ N` badge popover (V1, per Q.7.B in 02) | `ExecutionRecord.waveId` groups the popover rows | +| **History tab "Group by wave"** toggle | Existing History tab (V1.x) | SQL `GROUP BY labels.wave_id` over `AttackResult`s | +| **Tree-local diff view** (split cards: previous wave vs. current) | ConversationTree canvas, opt-in via "Compare to previous wave" | Per-node read of last two `waveId`s' ExecutionRecords | +| **Workspace timeline** (swimlanes per ConversationTree, waves as stripes) | New view, V2 | Cross-ConversationTree query: all `wave_id`s across `conversation_tree_id`s with timestamps | + +### 14.3 What this does NOT change + +- No backend schema change. `labels` is a flexible `dict[str, str]` already. +- No new endpoints. 
`wave_id` is set by the runner at POST time; queryable via the existing `?label=wave_id:X` filter on `/attacks` (the [`label` query param](../../../pyrit/backend/routes/attacks.py#L100-L106) is already a multi-value filter). +- No change to propagation, lifecycle, or fan-out semantics. +- No change to `executionHistory` GC (the `REFLOG_CAP_PER_NODE` cap, default 50, §6.6) — waves cross executions; the cap stays per-node. + +### 14.4 Wave ID generation - one rule + +A `waveId` is generated **once per top-level operator action**, not once per resulting POST: + +| Operator action | `waveId` behavior | +|---|---| +| Single-node `refreshNode(id)` | Generate one `waveId`; stamp the single new ExecutionRecord and AR | +| `refreshSubtree(rootId, ...)` | Generate one `waveId`; stamp every ExecutionRecord/AR produced under this call | +| `refreshTree()` | Generate one `waveId`; stamp every ExecutionRecord/AR; `waveTriggerKind = 'refresh_tree'` | +| Stack `+` add-to-all + auto-refresh | Generate one `waveId`; covers all N synced children's new sends | +| Restart a *failed* node after the wave finished | New `waveId`; `waveTriggerKind = 'retry_failed'` (it's a new operator intent, even though the original wave already wrote its waveId to all the *successful* leaves) | + +**Note:** `makeCurrent` itself does not generate a wave — it's a pure state-pointer swap (§6.7 step 6) with no ExecutionRecord write. If the operator subsequently invokes `refreshSubtree` to re-run the now-stale descendants, *that* refresh generates a wave whose `waveTriggerKind` is whatever the refresh action's kind is (`refresh_subtree`). There is no `'make_current'` variant. The authoritative `WaveTriggerKind` enum is defined in [§14.1](#141-the-data-model-addition) above; refer to it for the complete list. + +This rule keeps the operator's mental model simple: **one click = one wave**. + +### 14.5 Why not derive waves from timestamps post-hoc? + +Considered and rejected.
Clustering ExecutionRecords by timestamp proximity would mis-group concurrent edits in different conversation trees, mis-split slow refreshes that took longer than the clustering window, and require an arbitrary window-size choice with no good answer. Stamping `waveId` at refresh-call time is ~3 LOC, exact, and forward-compatible with any view we want to build. + +## 15. Audit posture - what V1 records and what it doesn't + +V1 of the tree UI is a red-teaming tool, and red-teaming tools are audited. Security teams ask: *"what was sent to which target, by whom, when, with what result?"* This section names what V1 records, what it doesn't, and where the gap lands on the roadmap. + +### 15.1 What V1 audits (per-leaf AR is the record-of-record) + +Every wave the operator triggers produces one `AttackResult` per leaf `Send` (per §7.2 AR-per-leaf). Each AR carries the full audit trail: + +- **Who:** `labels.operator` (set by the runner on every `POST /attacks`; durable post-0.16.0 per §9.1 / §7.4). +- **What:** every `MessagePiece` of the prepended conversation plus the leaf's assistant response, with their original/converted values, MIME types, and converter chain. +- **When:** AR `created_at` + per-message `created_at` timestamps; plus `labels.wave_started_at` so the auditor can group leaves by the operator click that produced them. +- **Where to:** `target_type`, `endpoint`, `model_name` captured in the AR's `target` field. +- **Why (intent):** `labels.wave_id` joins all ARs from one operator action; `labels.wave_trigger_kind` names *which kind* of action (per §14, e.g. `refresh_subtree`, `refresh_node`, `retry_failed`). +- **Lineage:** `prepended_conversation` pieces carry `original_prompt_id` chains so the auditor can trace every leaf back to its source. `labels.conversation_tree_id` groups all ARs from one tree; `labels.parent_conversation_tree_id` chains cloned trees back to their parent. + +**Net audit posture vs. today's chat:** strictly better. 
Today's chat has operator/target/lineage labels but no wave grouping (every `add_message` looks isolated). V1 adds wave grouping and tree grouping at zero cost to the audit story. + +**Client-side telemetry policy (V1.0, per rubber-duck Finding C.7).** V1.0 emits **no operator-behavior telemetry from the client** — no hover events, no modal-dismissal counters, no draft-abandon tracking, no `Switch tree` invocation counts, no debounce-drop logs. The only client-emitted observability is the per-leaf `ExecutionRecord` timing fields ([§4.6](#46-shared-types)) and the [03 §6.3 WaveEvent](03_runner.md#63-wave-events) stream, both of which describe *target interactions* (audit-relevant) rather than *operator UI behavior* (not audit-relevant for V1.0's red-teaming-tool context). V1.x adds opt-in operator-behavior telemetry via a Workspace settings toggle once the V1.x telemetry surface lands per [03 §12 Q.5](03_runner.md#12-open-questions); the V1.0 commitment to no-tracking-by-default removes the *"is the tree-UI watching me?"* question from internal-deployment threat models. + +### 15.2 What V1 does NOT audit (conversation tree structure is ephemeral) + +The conversation tree itself — the structure of nodes, edges, fans, stacks, and the operator's editing history within them — lives in client-only React state per §12.0. The audit-invisible operations are: + +- **Authoring without execution.** Operator builds a 60-node tree but only refreshes 5 of them. Audit shows the 5 refreshed leaves' ARs; the other 55 nodes leave no backend trace. +- **Delete operations.** Operator deletes 30 nodes from a tree (per the §5.16 delete-branch scenario). The underlying ARs that resulted from past refreshes of those nodes remain in the History tab; the *act of deleting* and *which nodes were deleted from the tree view* leaves no trace. +- **Param-edit history within a node.** Operator types "X", refreshes, types "Y", refreshes. 
The two ARs (from X and from Y) both persist with their respective inputs; the operator's intermediate edits between refreshes are not recorded. +- **Reflog browsing.** Operator clicks `⟲` and reads three past runs but doesn't `Make current`. The browsing leaves no trace. +- **Fan-axis exploration that doesn't reach a Send.** Operator builds a `FanNode(axis='converter')` with 5 variants but never refreshes the resulting Sends. No ARs produced; no audit trail. + +**Net acknowledged gap:** the auditor sees *what was sent and what came back*; they do not see *the shape of the operator's exploration*. For most red-teaming audit-of-record needs (regulatory traceability, harm-event triage, "show me every prompt that target X received from operator Y last week"), the existing per-leaf AR data is sufficient. + +### 15.3 Roadmap: V1.x option for structural audit + +If real-world audit asks come in (especially around "what did the operator try before they found this jailbreak?"), V1.x can opt into Option B from the decision review: **stamp `AttackResult.metadata['conversation_tree_slice']` with a snapshot of the root-to-leaf slice of the conversation tree that produced this AR.** Concretely: + +- Cost: one small backend extension (`CreateAttackRequest.metadata_overrides` from §7.4) + ~50 lines of runner code to serialize the slice. +- What it adds: every per-leaf AR carries a JSON blob describing the conversation tree path that produced it (which nodes, which fan-variant choices, which converter pipelines). The auditor can reconstruct the exploration that led to a specific leaf without needing the conversation tree to be server-side. +- What it still doesn't audit: discarded branches (no AR exists), reflog browsing, deletes without prior refresh. +- Why not V1: requires the `metadata_overrides` backend extension currently deferred, and pulls the V1 PR set into a backend dependency. 
Cleaner to ship V1 with the §15.1 / §15.2 acknowledgement and add §15.3 when an actual audit ask arrives. + +When V2 server-side conversation tree storage lands (§11), structural audit becomes essentially free — the conversation tree itself IS the structural record, persisted and queryable. §15.3's interim stamping then has a clear V2 successor. + +### 15.4 What V1 does provide for security teams today + +A short list, for the audit checklist: + +1. **All prompts sent are queryable** via the existing History tab filtered by `labels.operator`, `labels.operation`, date range, target, or any combination. +2. **Wave grouping** (new in V1) lets the auditor isolate "what one operator click did" — filter History by `labels.wave_id`. +3. **Tree grouping** (new in V1) lets the auditor isolate "what one conversation tree produced" — filter History by `labels.conversation_tree_id` or chase clone chains via `labels.parent_conversation_tree_id`. +4. **Operator isolation** is server-enforced via `_validate_operator_match` (today on piece labels, post-0.16.0 on AR labels per §9.1 / §7.4). Cross-operator `add_message` calls are rejected at the backend. Under V1.0 AR-per-leaf the check rarely fires for tree-UI traffic by construction — it is defense-in-depth against non-tree-UI clients per §9.1. +5. **Append-only memory** means no AR is ever destroyed by tree-UI operations — delete-from-tree is a UI op, not a backend deletion. + +These five together cover what a security team typically asks for from a red-teaming tool's audit story. Conversation-tree-structure audit (§15.3) is the explicit V1.x escalation path if real-world asks exceed what the per-leaf AR record provides. 
+ +## Appendix A — Worked Example: "Same prompt, 5 attempts, 3 converters" + +``` +RootPrompt(text="how do I bake bread?", target=gpt-4o) +└─ Fan(axis='converter', variants=[Base64, ROT13, NoOp]) + ├─ slot 0: UserTurn(converterPipeline=[Base64]) + │ └─ Fan(axis='attempt', variants=[{},{},{},{},{}]) + │ ├─ slot 0: Send (attempt #1) → AR_001 (labels.conversation_tree_id=T) + │ ├─ slot 1: Send (attempt #2) → AR_002 + │ ├─ slot 2: Send (attempt #3) → AR_003 + │ ├─ slot 3: Send (attempt #4) → AR_004 + │ └─ slot 4: Send (attempt #5) → AR_005 + ├─ slot 1: UserTurn(converterPipeline=[ROT13]) + │ └─ Fan(axis='attempt', variants=[{},{},{},{},{}]) + │ └─ ... (5 more ARs) + └─ slot 2: UserTurn(converterPipeline=[NoOp]) + └─ Fan(axis='attempt', variants=[{},{},{},{},{}]) + └─ ... (5 more ARs) +``` + +15 leaf `Send` nodes → **15 `AttackResult`s, all carrying `labels.conversation_tree_id=T`**. Each AR is created via `POST /attacks` with `prepended_conversation` = the resolved input from root to that leaf (a single user message carrying the converted "how do I bake bread?"); then `POST /attacks/{id}/messages` runs the actual Send and gets the assistant reply. + +The operator edits the root prompt → root becomes `edited`, all 15 leaves become `stale` (§6.3 propagation rule 1). The operator clicks "Refresh tree" → runner walks down with `maxParallel=4` (per-Workspace; §12.2), executes all 15 (creating 15 *new* ARs because the resolved input changed, so the old ones are preserved as part of `executionHistory` and remain visible in history with the old `conversation_tree_id`). Marks all `clean`. + +Storage cost per §7.5: 15 ARs, 15 conversations, 30 messages (15 user-prepended + 15 assistant). History view shows 15 rows for this `conversation_tree_id` - the operator typically filters by `conversation_tree_id` chip to scope. 
+ +## Appendix B — Worked Example: "Crescendo-style multi-turn with backtrack" + +``` +RootPrompt(text="initial benign question", target=gpt-4o) +└─ Send → AR_X turn 1 + └─ UserTurn(text="follow-up #1") + └─ Send → AR_X turn 2 (appended to the same AR — linear chain) + └─ Fan(axis='converter', variants=[NoOp, Rephrase, Translate]) + ├─ slot 0: UserTurn(converterPipeline=[NoOp]) + │ └─ Send → AR_Y_1 (new AR; prepended_conversation = AR_X's 2 turns + NoOp variant) + ├─ slot 1: UserTurn(converterPipeline=[Rephrase]) + │ └─ Send → AR_Y_2 + └─ slot 2: UserTurn(converterPipeline=[Translate]) + └─ Send → AR_Y_3 +``` + +Result: 4 `AttackResult`s (AR_X with 2 turns + 3 leaf ARs), all sharing `labels.conversation_tree_id=T`. Note that the linear chain at the top is one AR with 2 turns; only the fan boundary spawns new ARs. The Crescendo backtracking pattern ([crescendo.py#L66](../../../pyrit/executor/attack/multi_turn/crescendo.py)) is expressible as nested `Fan(axis='converter')`s after each refusal. The operator can edit one branch's follow-up text without disturbing the others. + +## Appendix C — Worked Example: "Sweep over targets" + +``` +RootPrompt(text="explain photosynthesis", target=) +└─ Fan(axis='target', variants=[gpt-4o, claude-3.5-sonnet, llama-3-70b]) + ├─ slot 0: Send → AR_1 (target=gpt-4o) + ├─ slot 1: Send → AR_2 (target=claude-3.5-sonnet) + └─ slot 2: Send → AR_3 (target=llama-3-70b) +``` + +3 `AttackResult`s, all sharing `labels.conversation_tree_id=T`. Under AR-per-leaf this is mechanically identical to any other fan axis (§9.2). The Fan node still renders a "spawns 3 attack results" indicator (§9.2 bullet 3) because the operator is creating 3 history rows. + +--- + +## Next Steps (in order) + +This document defines the **shape of the conversation tree**. + +1. **Types module + skeleton react-flow shell.** Land the TypeScript primitives from §4 + a non-interactive renderer with custom DFS layout (§8.2) that draws a hard-coded tree. 
Validates visual ergonomics before any execution wiring. +2. **ConversationTree-to-backend runner.** Implement `refreshNode` / `refreshSubtree` mapping to existing `attacksApi` calls per §7, using `prepended_conversation` for leaf-AR materialization. +3. **Inline editor + propagation.** Implement `editParams` with `edited`/`stale` propagation per §6.3. +4. **Branching.** Implement `branchToNewTree` per §6.5 (pure tree op; no backend call until refresh). V1.1 adds `branchToSubtree`. +5. **Operator isolation posture + auto-reverse migration.** Per §9.1, §9.3. +6. **`conversation_tree_id` label everywhere + history filter chip** (Q.A). Adds the chip in the existing `'chat'` tab's history view as a parallel landing strip for tree work. +7. **Soft caps + a11y keyboard layer.** §8.4 keyboard nav + §9.4 soft caps. + +Items deferred to V1.1 / V2: + +- Backend cancellation token (§12.8). +- Server-side conversation tree persistence (§11). +- Per-node morph animation between graph and linear views (§10.2). +- Auto-scoring on Send (§12.4, gated on a default-scorer concept landing in PyRIT). diff --git a/doc/gui/design/02_tree_ui_affordances.md b/doc/gui/design/02_tree_ui_affordances.md new file mode 100644 index 0000000000..7059ed3ba1 --- /dev/null +++ b/doc/gui/design/02_tree_ui_affordances.md @@ -0,0 +1,1233 @@ +# Tree-Based UI — Affordances, Layout, and Scenarios + +> Status: **DRAFT for review (revision 18)** — companion to [01_tree_primitives.md](01_tree_primitives.md). +> Scope: UX affordances, layout algorithm, scenario walkthroughs. +> Out of scope: data model (covered in primitives doc), implementation code, visual style. +> One primitives-level addition is requested here (§6); the rest is pure UX. +> Rolling revision history lives at [01 §0](01_tree_primitives.md#0-rolling-revision-history); refer there for cross-doc change summaries. + +### Version-scope legend + +This doc and [01_tree_primitives.md](01_tree_primitives.md) share the same version markers. 
See [01_tree_primitives.md §0 legend](01_tree_primitives.md#version-scope-legend) for definitions. + +The condensed V1.0 surface area (revision 9): +- **Nodes:** `RootPrompt`, `UserTurn`, `Send`, `ScoreNode`, `FanNode(axis ∈ {attempt, converter})`. +- **Stacks:** Fan-Children Stack (§3.1) only — Synced-Peers Stack and Stack-`+` gating are V1.1 (§3.2, §3.4a). V1.1 design treated as provisional pending V1.0 operator feedback. +- **Layout:** plain Buchheim-Walker via `d3-hierarchy.tree()` — main-path pinning is V1.1 (§4.3). +- **Branching:** `branchFromNode` always-new-tree variant **ships in V1.0** via the minimal-Workspace data model ([01 §13.1](01_tree_primitives.md#131-v10-minimal-workspace)); clicking `📋` swaps the active tree to the clone. The sibling-subtree variant (`🌿`, V1.1) renders as disabled stub in V1.0. The full tab strip is V1.1. +- **Auto-reverse:** linear chain + per-piece converter pipelines from history ships V1.0. Multi-conversation fanout-detection ([01 §9.3.1](01_tree_primitives.md#931-fan-grouping-algorithm-v11--original_prompt_id-chain-flattening--wave_id-disambiguator)) is V1.1. +- **Reload reconstruction:** restores `currentTree` from URL fragment via auto-reverse ([01 §9.4.1](01_tree_primitives.md#941-reload-reconstruction-v10)). The `beforeunload` guard ([01 §9.4.2](01_tree_primitives.md#942-the-beforeunload-guard-v10)) protects unsaved edits. `BroadcastChannel` advisory lock ([01 §9.4.3](01_tree_primitives.md#943-concurrent-tab-advisory-lock-v10)) prevents two-tab fork-bombs. +- **Pick / Unpick:** ships in V1.0 against fan-children (single `promotedChildSlotIndex` per FanNode) — without the synced-peers draft-placeholder dance from §3.3, which is V1.1. +- **Reflog cap:** `REFLOG_CAP_PER_NODE = 50` (configurable per-Workspace, see [01 §6.6](01_tree_primitives.md#66-executionhistory-gc-the-reflog)); eviction is operator-visible. + +## 1. Design Principles + +These four principles drive every decision below. + +1. 
**Familiar first.** The existing four chat-message buttons ([MessageList.tsx#L308-L420](../../../frontend/src/components/Chat/MessageList.tsx#L308-L420)) — *copy to input, copy to new conversation, branch conversation, branch attack* — are already in operators' muscle memory. Tree-view affordances should map onto these or replace them with something obviously better, never confuse them with a new vocabulary. +2. **Edge-affordances over modal buttons.** Adding a node into the middle of a chain is something operators want to do often. A `+` button that *appears between two nodes when you hover the edge* (the pattern used by n8n, Zapier, Linear's workflows) is cheaper than a "select node, click Insert After, pick type" modal flow. +3. **Stacks are the unit of repetition.** A `FanNode` with N identical-looking children is visual noise. The Stack — a single rendered card that *contains* N synchronized subtrees — is how the UI represents a fan that hasn't been edited per-child yet. The user's "drag follow-up over the fanned-out messages" intuition is this same concept. +4. **One canonical action per intent.** "Run this prompt 10 times" is one user intent. The UI should not require the operator to *choose between* "add Fan, axis=attempt" and "click re-run 9 times". Re-run multiple **promotes** to a Fan automatically. + +--- + +## 2. Affordance Inventory + +### 2.1 Per-edge: insert-on-edge `+` + +The single most important affordance. When the operator hovers an edge between two nodes (or the empty space below a leaf), a translucent `+` chip slides in mid-edge. Clicking it opens a popover: + +``` + Send ✓ + │ + │ + ← hover affordance, click to open + │ + ╔═══╧═════════════════════╗ + ║ Insert after this Send ║ + ║ ║ + ║ ▸ Follow-up user message║ (UserTurn, role=user) + ║ ▸ Inject assistant text ║ (UserTurn, role=simulated_assistant) + ║ ▸ Score ║ (ScoreNode) + ║ ▸ Fan out: ... 
║ (submenu: attempt / prompt / converter / target) + ╚══════════════════════════╝ +``` + +The same affordance, hovered between a `UserTurn` and a `Send`: + +``` + UserTurn: "How do I bake bread?" + │ + │ + ← popover changes contextually + │ + ╔═══╧═════════════════════╗ + ║ Insert after this turn ║ + ║ ║ + ║ ▸ Send to target ║ (rare — usually auto-inserted) + ║ ▸ Append converter ║ (modifies the UserTurn's pipeline) + ║ ▸ Fan out: converter ║ (wraps in a Fan) + ║ ▸ Fan out: prompt ║ + ╚══════════════════════════╝ +``` + +**Why context matters in the popover:** the legal next-node types depend on the upstream node's kind. After a `Send` you almost always want a follow-up or a fan; after a `UserTurn` you usually want a converter or send. Hiding illegal options is cheaper than enabling-with-error. + +### 2.2 Per-node action rail + +A small action row floats below each node card on hover/focus. Icons only when collapsed; labels appear on hover-of-the-icon. + +> **Version scope.** Every icon below ships in V1.0 unless explicitly marked **V1.1**. V1.1-marked icons render as disabled in V1.0 with a tooltip pointing to the V1.0 fallback (where one exists). Disabled-in-V1.0 affordances keep their slot reserved so V1.1 is a state flip, not an introduction (the rationale is "don't create a V1.0 trigger that V1.1 would then repurpose"; see [01 §6.5](01_tree_primitives.md#65-branch-from-node---the-immutable-history-primitive)). + +**Common to every node:** + +| Icon | Action | Version | Notes | +|---|---|---|---| +| `↻` | Refresh | V1.0 | Per §6.3 in primitives. Long-press / shift-click opens `Refresh subtree`. **Cost-preview tooltip (rev 18, per rubber-duck Finding D.3):** every `↻` button's hover-tooltip carries an estimated-call-count for the wave it would trigger, computed cheaply at render time from the stale-set (e.g., *"Refresh subtree (≈60 calls, 5 leaves)"*). 
Same estimator the [§8.1 cost-modal](#81-the-v1-chain-preview-banner--confirm-modal--toast--drawer-panel) reads. Cures the *"operator dismisses the modal once, then learns to ignore it"* failure mode by surfacing cost on hover before the click commits to a modal. ~30 LOC of tooltip wiring; high asymmetric value. | +| `📋` | Branch from here / Clone tree | **V1.0** | Per §6.5 in primitives. **V1.0 lands** by swapping the Workspace's `currentTree` to the clone; source is re-openable from History. **V1.1 lands** as a new tab in the tab strip. Label: **"Clone tree"** on root, **"Branch from here"** otherwise. | +| `🌿` | Branch as subtree (same canvas) | **V1.1** | Per §6.5 in primitives. Lands the cloned slice as a sibling subtree of the source node in the *same* ConversationTree, no tab switch. **V1.0:** rendered disabled with tooltip *"Available in V1.1"*. The slot is reserved here so V1.1 enablement does not introduce a new trigger that conflicts with `📋`. Branch-glyph chosen for visual distinctness from `📋` (clipboard-glyph) — the two icons sit adjacent on every node's action rail and operators must not mistake them. | +| `🗑` | Delete | V1.0 | Confirmation modal; preserves backend `AttackResult`s under same `conversation_tree_id` (§5.16 below) | +| `🔍` | Open in linear view | V1.0 | Switches the linear pane to focus on this node's path; the tree view stays loaded (§10 in primitives) | + +**`RootPromptNode`-specific:** + +| Icon | Action | Version | +|---|---|---| +| `✏` | Edit prompt + target + system prompt (inline editor) | V1.0 | +| `📎` | Add attachment | V1.0 | + +**`UserTurnNode`-specific:** + +| Icon | Action | Version | +|---|---|---| +| `✏` | Edit text inline | V1.0 | +| `🔀` | Wrap in `FanNode(axis='prompt')` with this turn as variant #0 — the user's "shuffle" intuition | **V1.1** (depends on `prompt` axis; see [01 §4.4](01_tree_primitives.md#44-structural-nodes--the-single-fan-out-primitive)). **V1.0:** rendered disabled. 
| +| `⚡` | Open converter palette (adds to `params.converterPipeline`) | V1.0 | +| `≡` | Change role (`user` ↔ `simulated_assistant` ↔ `system`) | V1.0 | + +**`SendNode`-specific:** + +| Icon | Action | Version | +|---|---|---| +| `↻` | Re-run (single — one more attempt, recorded in `executionHistory`) | V1.0 | +| `↻×N` | Re-run multiple — **promotes to `FanNode(axis='attempt', variants=[…])` automatically** (§3.1 below) | V1.0 | +| `🎯` | Change target (per-node override) | **V1.1** (depends on `target` axis; rendered disabled in V1.0) | +| `💬` | View raw response panel (right-hand drawer) | V1.0 | +| `★` | Pin as "main" path leg (visual emphasis; see §4.3 layout) | **V1.1** (main-path pinning deferred — see §4.3). **V1.0:** the icon is not rendered at all (no V1.0 trigger to reserve; the centerline layout pass simply doesn't exist yet, so there is nothing the operator's flip-of-a-flag would activate). | + +**`FanNode`-specific:** + +| Icon | Action | Version | +|---|---|---| +| `+` | Add another variant | V1.0 | +| `≡` | Change axis (only legal before any children have executed; otherwise destructive op with confirmation) | V1.0 (axis choices limited to `attempt` and `converter` in V1.0 per [01 §4.4](01_tree_primitives.md#44-structural-nodes--the-single-fan-out-primitive)) | +| `⊟` / `⊞` | Collapse to Stack / Expand to per-child cards (§3 below) | V1.0 (Fan-Children Stack only; Synced-Peers Stack is V1.1) | +| `↻` | Refresh all children (parallel, respects `maxParallel`) | V1.0 | + +**`ScoreNode`-specific:** + +| Icon | Action | Version | +|---|---|---| +| `✏` | Configure scorer + params | **V1.1** (depends on `runScorer(node_id)` per [01 §4.5](01_tree_primitives.md#45-observational-nodes-no-side-effect-on-the-conversation)). **V1.0:** rendered disabled with tooltip *"Scorer configuration is V1.1; V1.0 displays scores already attached to upstream pieces."* Slot reservation against UX regression. 
| +| `📊` | View score distribution (across all leaves in current subtree) | V1.0 | + +### 2.3 Canvas-level affordances + +- **Top-left ribbon:** + - `+ New tree` (when canvas is empty) + - `← Linear view` toggle (switches the right pane to the linear chat; tree stays in the left pane) + - `conversation_tree_id` chip + `Open in History` link + Copy affordance (the §9.4.3 two-tab workflow pastes this into a second browser tab) + - **`Switch tree`** button (V1.0; §13.1 minimal-Workspace surface). Opens a popover listing the Workspace's `recentTreeIds`; selecting one calls `openTree(id)` and the canvas swaps. *V1.1 replaces this with the tab strip.* + - Operator label + - **Wave status:** when nodes are edited/stale, shows `"1 edited, 60 stale · ~60 calls · [Refresh tree]"`. During an in-flight wave, shows progress + cancel: `[ ●●●●●●○○○○ ] 6/60 (3 ✓, 0 ⚠, 0 ⏱, 1 ⦾, 1 ●) [Cancel]` — the five-value tail is `succeeded / failed / rate-limited / blocked / running`. `⏱ rate-limited` counts leaves whose `failure_class='rate_limited'` per [03 §3.3a `_format_api_error`](03_runner.md#33a-helpers-referenced-by-the-dispatch-step) (HTTP 429 or provider-specific overloaded shapes). `⦾ blocked` counts leaves dropped from `ready` by the [03 §5.3](03_runner.md#53-cascade-on-failure) in-flight cascade (an ancestor failed earlier in this wave). Cancel calls `runner.cancelWave(treeId)` per [03 §9](03_runner.md#9-cancellation); button transitions to disabled `[Cancelling…]` while in-flight leaves drain; the toast then reads *"Wave cancelled: 6 ✓, 0 ⚠, 0 ⏱, 1 ⦾, 54 cancelled. [View wave]"*. When the per-tree queue ([03 §10.3](03_runner.md#103-backpressure-per-tree-wave-queue)) is non-empty, a separate `[Cancel queued]` chip appears on the same banner and calls `runner.cancelQueued(treeId)` — drops queued waves without touching the active one. After a wave completes the toast in §8.1 takes over. 
+ - **Deep-chain warning** (V1.0 §1 V1.0 exclusions): when the deepest path in the current tree reaches 180 turns, the ribbon shows *"This conversation is approaching the 200-turn ceiling. Use Branch from a midpoint to keep extending."* with a quick-action chip that scrolls to a midpoint UserTurn and arms its `📋` button. +- **Bottom-right minimap** (react-flow built-in) showing the full tree with a viewport rectangle. +- **Bottom-left zoom controls** + a `Fit to view` button (also a keyboard shortcut `F`). +- **Right-side action drawer** (slides in when a node is selected) — tabs: + - `Current` — params editor + most recent execution. + - `Past runs (Reflog)` — per-node reflog popover content (Q.7.B). + - `Recent waves` — ConversationTree-scoped wave list (§8.2); always available regardless of which node is selected. + - `Compare` — V2 (§8.5). +- **Wave completion toast** (bottom-right, transient): `"Wave complete: 57 ✓, 3 ⚠, 0 ⏱, 0 ⦾, 0 ✋. [View wave]"` — see §8.1. The five-value tail is `succeeded / failed-retryable / rate-limited / blocked / needs-fix`. The `✋ needs-fix` count (rev 18, per rubber-duck Finding B.3) surfaces leaves whose `failure_class='permanent'` per [03 §3.3a](03_runner.md#33a-helpers-referenced-by-the-dispatch-step) (HTTP 4xx that aren't 429 — schema rejection, content policy block, malformed request); these are excluded from `[Retry failed]`'s `nodeIds` because clicking won't help. Without a distinct count, an operator who clicks `[Retry failed]` and sees the `failed` count not decrease has no surfaced explanation (the prior 4-class-to-3-bucket asymmetry silently dropped `permanent` into `failed` and left operators hovering chips to find out why retry was a no-op). 
`⏱ rate-limited` surfaces leaves whose `failure_class='rate_limited'` (HTTP 429 + provider-specific overloaded shapes); the [Retry failed] button is **disabled when every failed leaf is rate-limited** (operator must wait for the target's rate-limit window to clear, then click Refresh tree manually). When in-flight cascade ([03 §5.3](03_runner.md#53-cascade-on-failure)) drops sibling leaves of a failed ancestor, the toast surfaces them as `⦾ blocked` (distinct from `⚠ failed`). The [Retry failed] button starts a fresh wave that retries `failure_class='transient'` failures and their blocked descendants; rate-limited leaves are excluded and remain failed in the wave summary. +- **Reflog eviction summary** (V1.0; §6.6 of primitives): when the runner evicts unpinned reflog entries during a wave, the count is **aggregated into the wave-complete toast** rather than firing per-eviction markers (which would stack and push the toast off-screen). The toast reads: *"Wave complete: 57 ✓, 3 ⚠. Past runs evicted: 12. [View wave]"*. Single-eviction events outside a wave (e.g., `makeCurrent` displacing an entry while at cap, §6.7) fire a single transient marker for ~8 seconds: *"Past run evicted from node X. [Pin evicted run] [Increase cap]"*. *Operator-facing terminology uses "past run(s)"* per the friendly-first §7 Q.7.A convention; "reflog" appears only in code, data-model docs, and the right-click git-alias menu. +- **Multi-tab busy modal** (V1.0; §9.4.3 of primitives): when this tab attempts a Refresh but another tab holds the advisory lock for this `conversation_tree_id`, a modal appears: *"Another tab is refreshing this tree. [Refresh anyway] [Wait]"*. 
+- **Operator-tag-required modal** (V1.0; [03 §2.1 entry-point shim step 1](03_runner.md#entry-point-shim-ordering-v10) + [01 §9.1 isolation posture layer 2](01_tree_primitives.md#91-operator-isolation-posture)): when the operator clicks Refresh tree / Refresh subtree / Refresh node while `currentOperator()` returns null/empty (the operator never set a tag this session, or cleared it from the ribbon), the runner aborts pre-dispatch and emits a `WaveEvent { kind: 'operator_tag_required' }`. The UI surfaces a modal: *"Operator tag required. This refresh would create AttackResults with no operator tag, which makes them hard to find in History and breaks per-operator isolation. Set your operator tag in the top bar, then click Refresh again. [Set operator tag] [Cancel]"*. `[Set operator tag]` focuses the ribbon's operator-tag input; `[Cancel]` dismisses; either way, no backend call has fired, no cross-tab lock was acquired, AND the cost-preview modal is suppressed (it would normally fire as shim step 3, after the lock acquire at step 2; the tag gate at step 1 returns first). *Note: `operation` (§15 audit tag) is NOT gated — operators mid-experiment may genuinely refresh without an operation set; a top-banner reminder surfaces when `operation` is empty but the wave proceeds.* +- **Ctrl-Z structural undo** (V1.0; per [01 §6.9](01_tree_primitives.md#69-node-editor-undo-v10)). Ctrl-Z (or Cmd-Z on macOS) inside the canvas pops the last structural edit — add/delete/editParams/regenerateFanChildren/makeCurrent — from the per-tree undo stack (capped at 20 entries, FIFO eviction). **Native input undo unaffected:** when a node's textarea has focus, Ctrl-Z does typing-level undo (browser default); operators press Esc to blur the textarea before structural Ctrl-Z reaches the canvas handler. Tree-swap clears the stack; reload loses it (matches the [01 §9.4.1](01_tree_primitives.md#941-reload-reconstruction-v10) reload-loss contract for edits). 
No redo in V1.0 — Ctrl-Shift-Z lands V1.x if operators report needing it. + +### 2.4 Per-stack affordances + +When a Fan is in Stack rendering (§3), the stack itself has its own action rail at its bottom edge: + +| Icon | Action | Version | +|---|---|---| +| `+` | Add a synchronized child to all members of the stack (the "fan-through" case — §5.6) | **V1.1** (depends on Synced-Peers Stack — §3.2). **V1.0:** rendered disabled with tooltip *"Available in V1.1"*. | +| `⊞` | Expand stack to show per-child cards | V1.0 | +| `🎯` | "Pick one" — promote one member (sets `FanNode.params.promotedChildSlotIndex`, dims the others) | V1.0 (without the V1.1 draft-placeholder dance from §3.3 — V1.0 just dims the non-promoted children) | +| `↻` | Refresh all children | V1.0 | + +--- + +## 3. The Stack — Two Distinct Visual Aggregations + +The doc previously described "the Stack" as one concept with two uses. The second-pass review of decision #3 showed they are **two distinct render rules** that often coexist in the same tree but follow different predicates and have different operator semantics. Naming them separately removes a real source of confusion. 
+ +| | **Fan-Children Stack** (§3.1) | **Synced-Peers Stack** (§3.2) | +|---|---|---| +| What it groups | Direct children of one `FanNode` whose subtrees look identical (typically `attempt`-axis) | N nodes added together via Stack-`+` (the §5.6 fan-through pattern), wherever they live in the tree | +| Trigger | Automatic on render when the predicate holds | Operator clicks the synced-peer Stack's `+` affordance | +| Underlying field | None — pure derivation from `parentId` + structural identity | `addedToStack: boolean` on each peer (see §6.1) | +| Edit semantics | None — fan-axis variants ARE the per-child differences, there is nothing to "sync" | Stack-edit propagates to all peers via parent-walk peer detection | +| Decomposes when | A child's subtree shape differs from peers | A peer's `params` differs from peers (divergence is implicit) | + +Both can apply at different layers of the same canvas. The §5.6 scenario has *both* — the fan card aggregates 10 identical Send children (Fan-Children Stack), and below them sit 10 synced UserTurns added by Stack-`+` (Synced-Peers Stack). + +### 3.1 Fan-Children Stack — visual aggregation only + +When a `FanNode` has N children with **identical recursive subtree structure** (e.g., right after creation of an `attempt` fan, or after a "Refresh all"), the UI does not render N separate cards. It renders one card with a multiplicity badge: + +``` + UserTurn: "How do I bake bread?" + │ + ▼ + ┌─────────────────────────────────┐ + │ Fan: axis=attempt, n=10 │ + │ │ + │ ┌─────────────────────────┐ │ + │ │ Send ×10 │ │ ← Fan-Children Stack: 10 Sends + │ │ "9 ✓, 1 ⚠" │ │ shown as one card with + │ │ ▶ expand to see each │ │ aggregate status + │ └─────────────────────────┘ │ + └─────────────────────────────────┘ +``` + +Compare to expanded rendering: + +``` + ┌─────────────────────────────────┐ + │ Fan: axis=attempt, n=10 │ + │ ┌──────┐┌──────┐┌──────┐... 
│ + │ │Send✓ ││Send✓ ││Send⚠ │ │ ← per-child cards: visual sprawl + │ └──────┘└──────┘└──────┘ │ + └─────────────────────────────────┘ +``` + +Stack rendering is the default; expand-on-demand. **Collapse to Stack** is auto-applied when N>3 and all children are structurally identical; otherwise expanded. + +**There is no data-level synchronization here.** Fan-axis children of `prompt`/`converter`/`target`/`system_prompt`/`temperature` are deliberately *different* (the variant payload IS the difference), so they never collapse — only the `attempt` axis produces a collapsible Fan-Children Stack in practice. None of these children carry `addedToStack`; the aggregation is a pure render rule keyed on `parentId` + structural match. + +### 3.2 Synced-Peers Stack — synchronized authoring surface + +> **Version scope: V1.1 (design treated as provisional pending V1.0 operator feedback).** The synchronized authoring surface (the user's "drag a follow-up over the fanned-out messages" intuition) lands in V1.1. The Stack-`+` affordance on Fan cards renders disabled in V1.0 (see [§2.4](#24-per-stack-affordances)). **V1.0 fallback for fan-through:** operators expand the Stack (`⊞`) and add a follow-up under each child individually, or wait for V1.1. The `addedToStack` field on `ConversationTreeNodeBase` is **not present** in the V1.0 type (§6.1 deferred to V1.1; revision 9 dropped the V1.0 reservation). +> +> **Why provisional:** the parent-walk peer detection, the params-deep-equality re-stacking rule, the divergence-decomposes-stack behavior, and the Promoted-state draft-placeholder semantics from [§3.3](#33-stack-semantics---three-operations-two-visual-states) are clever but have not been pressure-tested by real operators. The V1.0 release is the first time operators will use Fan-Children Stack at scale and form opinions about whether the synced-peers metaphor matches their workflow at all. 
**Revision 9 commits to revisiting the entire §3.2 design after V1.0 ships** — if operators don't actually want the fan-through pattern, or want something different (e.g., copy-the-edit-to-all instead of bidirectional sync), the V1.1 design changes accordingly. The detailed spec below is the leading candidate, not a frozen commitment. + +The user's "drag a follow-up over the fanned-out messages" intuition translates to: **a Stack accepts new children, and adding a child to a Stack adds it under each member, with the new descendants synced to each other.** + +``` + ┌─────────────────────────────────┐ + │ Fan: axis=attempt, n=10 │ + │ │ + │ ┌─────────────────────────┐ │ + │ │ Send ×10 │ │ ← Fan-Children Stack (§3.1) + │ │ "9 ✓, 1 ⚠" │ │ + │ └─────────────────────────┘ │ + │ │ + │ + ← stack `+` affordance: "add to all" + │ │ + └─────────────────────────────────┘ + + (click `+`, choose "Follow-up user message") + ┌─────────────────────────────────┐ + │ Fan: axis=attempt, n=10 │ + │ │ + │ ┌─────────────────────────┐ │ + │ │ Send ×10 │ │ + │ └─────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────┐ │ + │ │ UserTurn ×10 (synced) │ │ ← Synced-Peers Stack: + │ │ "Now expand on point 3"│ │ all 10 share addedToStack=true, + │ └─────────────────────────┘ │ edit propagates to all + │ │ │ + │ ▼ │ + │ ┌─────────────────────────┐ │ + │ │ Send ×10 │ │ ← also Synced-Peers Stack + │ │ (draft, click refresh) │ │ (auto-inserted, also marked + │ └─────────────────────────┘ │ addedToStack=true) + └─────────────────────────────────┘ +``` + +Under the hood the conversation tree has **10 actual `UserTurnNode`s** (and 10 auto-inserted `SendNode`s) under the 10 fan-children Sends. Each carries `addedToStack=true`. The grouping is **not** recorded in a shared UUID — it is **derived** at render time by walking each candidate's `parentId` chain to the nearest `FanNode` ancestor and grouping those that share the same ancestor + depth-below. 
+ +**Peer-detection rule (precise):** two nodes A and B are Synced-Peers Stack peers iff +1. `A.addedToStack === true` AND `B.addedToStack === true`, +2. The nearest `FanNode` ancestor of A equals the nearest `FanNode` ancestor of B (same node UUID), AND the number of edges from each up to that ancestor is equal, +3. `A.params` deeply equals `B.params` (divergence is implicit — no flag). + +All three conditions are keyed on data the conversation tree already has (`parentId`, `kind`, `params`). No new UUIDs, no synthetic signatures. + +### 3.3 Stack semantics - three operations, two visual states + +> **Version scope.** The two-state table below is the **V1.1 model** with draft-placeholder semantics. **V1.0 simplification:** with no Synced-Peers Stack (§3.2 is V1.1), the Promoted state collapses to "dim the non-promoted children; the Stack-`+` is disabled." No draft placeholders, no Stack-edit divergence, no Unpick-activates-placeholders. **V1.0 Pick = set `promotedChildSlotIndex`; visual dim. V1.0 Unpick = clear it; visual re-equalize.** That's it. The full table below is preserved for V1.1 implementers; V1.0 readers can mentally drop everything about Stack-`+`, draft placeholders, and Stack-edit-propagation. + +Stack operations apply to both Fan-Children and Synced-Peers stacks; they share UI affordances. Per Q.A.4: instead of detaching a picked member into its own card (which would shift the layout), **promotion is purely a visual state on the existing Stack**. The Stack card stays put; the promoted member gets full color + highlight border; the others dim to ~40% opacity. The `+` affordance stays anchored to the Stack and unambiguously means "add a child to this layer" (see §3.4a for the one-`+`-per-fan-layer gating rule). 
+ +This collapses the previous revision's three-state model (synced / promoted-detached / frozen) into **two states** with one transition: + +| State | When | Visual | Stack `+` adds child to | Stack-edit targets | +|---|---|---|---|---| +| **Synced (default)** | No promotion set (`FanNode.params.promotedChildSlotIndex` is `null`) | All N peers rendered equally | All N peers (a new Synced-Peers Stack, `addedToStack=true` on each new node, all non-draft) | All N peers via parent-walk rule (§3.2) | +| **Promoted** | One peer set as promoted (`FanNode.params.promotedChildSlotIndex` is some slotIndex) | Promoted peer: full opacity + highlight border. Others: ~40% opacity, hover-readable, not editable, no new children added under them. | All N peers (`addedToStack=true` on each), BUT only the promoted peer's added node is non-draft; the other N-1 added nodes are `draft` placeholders that show as dimmed shadows in the expanded view. If the operator later Unpicks, the placeholders activate (transition to `edited`) so the Stack becomes a real Synced-Peers Stack across all N. | Promoted peer only | + +**Three operations:** + +1. **Stack-edit** - edit text or params on the Stack card. Under *Synced* this propagates to all peers (Synced-Peers Stack via parent-walk rule, §3.2). Under *Promoted* it targets only the promoted peer's path; the N-1 draft placeholders mirror the edit so that if the operator later Unpicks, the placeholders are ready to activate. +2. **Pick** - set `FanNode.params.promotedChildSlotIndex` to the clicked member's `slotIndex`. Instant visual transition to Promoted state; no layout shift; no tree restructuring; no execution change. Clicking a different member's "Pick" while already in Promoted state simply swaps the promotion; any draft placeholders inherited from the previous promotion remain dimmed under their new context. The cherry-pick analogue from the git mental model in §3.5. +3. **Unpick** - set `promotedChildSlotIndex` back to `null`. 
Returns to Synced. The N-1 placeholders activate (each is now a peer just like the originally-promoted one was). Useful when the operator decides "actually I want to keep exploring all 10 branches synchronously again". + +**Why N-symmetric peers in Promoted state instead of singletons?** Per Q.3.3 (revision 7): a singleton add in Promoted state followed by Unpick would leave an asymmetric tree (1 peer under one fan-child, 0 under the others), which the §3.4 predicate sees as un-stackable and decomposes into expanded per-card rendering. Symmetric N-peer adds with N-1 placeholders preserves the option to return to synced exploration without operator surprise. The placeholders consume no token cost (they don't refresh until activated) and the runner only dispatches `Send`s for non-draft nodes. + +**Promotion is per-FanNode.** If a tree has nested fans (Fan A with 10 children, child #4's subtree contains Fan B with 5 children), Fan A's promotion of child #4 does not affect Fan B. Fan B has its own independent `promotedChildSlotIndex`. The visual de-emphasis cascades (child #4's subtree renders at full opacity; #1-3, #5-10 and their entire subtrees render dimmed), but the *editing* model stays per-FanNode. + +**Pursuing two promotions in parallel** is not a primitive - it is a tree-clone operation via `branchToNewTree(treeRoot)` (§6.5 of primitives). Two trees, two tabs, two different `promotedChildSlotIndex` values. Operators flip between tabs to compare. + +### 3.4 Stack rendering predicates - both apply, independently + +**Fan-Children Stack** (§3.1) renders iff: +1. Parent is a `FanNode`. +2. All children have structurally identical subtrees (recursive shape and kinds match; `params` and execution may differ). +3. Operator has not explicitly clicked "Expand" on this Fan. + +**Synced-Peers Stack** (§3.2) renders iff: +1. Two or more nodes share the same nearest `FanNode` ancestor at the same depth below. +2. All of them have `addedToStack=true`. +3. 
All of their `params` are deeply equal (any divergence collapses the visual stack into per-card rendering for that layer; convergence later re-stacks). + +The two predicates are independent. A given canvas may show a Fan-Children Stack at the fan layer and a Synced-Peers Stack two layers below it (as in the §5.6 worked example). Decomposition of one does not force decomposition of the other. + +The Promoted state is **orthogonal** to both predicates: promotion does not break stack rendering. The stack with one promoted member is still rendered as a stack (the visual difference is opacity + border, not layout). + +### 3.4a Stack-`+` gating - one synced layer per fan, chain extends downward + +> **Version scope: V1.1.** This gating rule only applies once Synced-Peers Stacks exist; V1.0 has none, so the Stack-`+` affordance on Fan cards is uniformly disabled (see [§2.4](#24-per-stack-affordances)) and no gating logic is needed. The rule below describes V1.1 behavior. + +Per Q.3.4 (revision 7): the Stack-`+` affordance is **gated** so that each fan layer can host at most one synced-peer set. The rule disambiguates the affordance and eliminates the "two batches merge into one stack" surprise from earlier revisions. + +**Stack-`+` on a Fan card** (the affordance that begins a new synced chain) is shown iff no `addedToStack=true` node has this Fan as its nearest-Fan ancestor at depth-below=2. In plain words: a Fan offers Stack-`+` until the operator clicks it once. After that, the chain extends downward from the new Synced-Peers Stack, not from the Fan. + +**Stack-`+` on a Synced-Peers Stack card** (the affordance that extends an existing synced chain) is **always shown**. The new peers it creates inherit the same nearest-Fan ancestor + a deeper depth-below, so they form their own layer and don't collide with anything above. 
+ +Visually: + +``` +Fan(attempt, n=5) + ┌───────────────────────────────┐ + │ [Send ×5] │ + │ │ │ + │ + ← Stack-+ available │ (first add at this depth) + │ ↓ │ + │ [UserTurn ×5 "Why?"] │ ← addedToStack=true + │ │ │ + │ (no +) ← Stack-+ DISABLED │ (fan layer already has a synced layer) + │ │ + └───────────────────────────────┘ + + │ (the chain extends here, from the synced-peers stack) + ▼ + ┌───────────────────────────────┐ + │ [UserTurn ×5 "Why?"] │ + │ + ← Stack-+ available │ (extend the chain downward) + │ ↓ │ + │ [Send ×5 (draft)] │ + └───────────────────────────────┘ +``` + +**Edge cases:** +- Operator deletes the synced-peer layer entirely → Stack-`+` on the Fan re-enables (predicate true again). +- Operator diverges one peer (per-edit) so the synced layer visually decomposes → Stack-`+` on the Fan stays disabled. Divergence is a render state, not a data-model state; the peers still exist with `addedToStack=true`. +- Nested fans (Fan A at depth 0, Fan B at depth 4 inside one of A's branches) → Fan A's Stack-`+` is gated on A's depth-below=2; Fan B's is gated on B's depth-below=2. Independent gates. + +**Implementation cost:** one tree-walk predicate check per fan render. Bounded by fan-children count. Cheap. + +**What this means for the operator:** if they want "two different follow-ups in parallel under all 5 attempts," they either (a) edit one of the existing synced UserTurns into a fan itself (`Fan(axis='prompt', variants=[A, B])`), or (b) clone the whole tree and try the second follow-up in the clone. Both are more honest about what they're doing than two competing synced layers at the same fan depth. + +### 3.5 Git mental model + +The primitives doc has the full table in [01_tree_primitives.md §6.8](01_tree_primitives.md#68-git-mental-model-for-operator-vocabulary); this section is the affordances-doc summary an operator might read first. 
+ +The whole tree-view design lines up surprisingly well with git, and **operator vocabulary in the UI uses git verbs**: + +- A tree node's `execution` is its current **commit** (the most recent `ExecutionRecord`). Its `executionHistory` is the **reflog**. +- Editing a node and then clicking the canvas-level "Refresh tree" button performs what git calls a **rebase** — downstream nodes that became stale rebuild on top of the new upstream. +- The "Pick" operation on a Stack is **cherry-pick**: choose one of N runs as the canonical commit on this ref. +- Branching from a node is `git branch new-branch <start-point>` — a cheap copy of refs, no commits duplicated. +- `branchToNewTree(root)` is "Clone tree"; `branchToNewTree(anyOtherNode)` is "Branch from here". One function, two labels (§6.5 of primitives). The V1.1 `branchToSubtree(nodeId)` ships under a separate `🌿` affordance with sibling-subtree landing. +- Selecting a past run from a node's reflog enters **detached HEAD** rendering (dotted border, banner); re-running while detached creates a fresh tip and exits detached state. + +**Two places the analogy is loose** (operators should know): + +- A git branch has one tip; our conversation tree has many tips (one per leaf Send). So "tree = branch" is more like "tree = a workspace containing one or more git-like ref chains". +- Git rebase is destructive (old commits become unreachable from any ref). Our refresh is **non-destructive** — old `ExecutionRecord`s stay in each node's reflog (capped at `REFLOG_CAP_PER_NODE`, default 50, configurable per-Workspace; see [01 §6.6](01_tree_primitives.md#66-executionhistory-gc-the-reflog)), and the underlying backend `AttackResult`s remain queryable in the History tab filtered by `conversation_tree_id` regardless of tree-side state. + +The data model keeps its existing names (`conversation_tree_id`, `ExecutionRecord`, `executionHistory`, `branchToNewTree` / V1.1 `branchToSubtree`).
Primary UI button labels match the API verbs (`Refresh node` / `Refresh subtree` / `Refresh tree`). Git terminology surfaces for execution-history concepts only — `Reflog` / `Past runs` tab title, `Cherry-pick` Stack action, `Checkout this run` for inspecting past runs, `Make current` for promoting from the reflog, `Clone tree` / `Branch from here` for `branchToNewTree`. + +--- + +## 4. Layout + +### 4.1 Goals + +In rough priority order: + +1. **No overlap.** Hard constraint. +2. **Determinism.** Same tree → same coordinates. Operator muscle memory is real. +3. **Tightness.** Use horizontal space efficiently; wide trees should not be 4× wider than necessary. +4. **Stable under edit.** Adding/removing one node should shift the rest of the tree as little as possible — operator focus stays where it was. This is a layout-engine pick + an animation policy (§4.5). +5. **Main path is visually obvious.** When a leaf is pinned (§2.2 SendNode `★`), the root→leaf chain renders as a perfectly straight vertical spine. **V1.1** — main-path pinning is deferred from V1.0 (the `★` affordance is not rendered in V1.0; see §2.2 and §4.3 below). + +### 4.2 Algorithm comparison + +| Algorithm | Time | Tightness | Equal-subtree symmetry | Stability under edit | Notes | +|---|---|---|---|---|---| +| **Naïve DFS width-summing** (what §8.2 of primitives proposes) | O(n) | Loose (always equal to sum of widths) | Yes | OK | The 50-LOC option. Wastes horizontal space when subtrees are very different sizes | +| **Reingold–Tilford** | O(n²) | Tight (subtree contours interleave) | Yes | OK | The textbook "tidy tree". Quadratic in the worst case | +| **Buchheim–Walker** | O(n) | Same as Reingold–Tilford | Yes | OK | Reingold–Tilford done in linear time. The standard for "tidy trees" today.
This is what `d3-hierarchy.tree()` actually implements | +| **Force-directed** (d3-force) | O(n²) per iter | Variable | No (re-runs converge differently) | Bad — every edit re-jostles the whole graph | Wrong shape for our tree; reject | +| **Sugiyama** (dagre) | O(n²) typical | Good | No (DAG-oriented) | OK | Designed for DAGs; overkill for our tree | +| **Manual / grid** | — | — | — | — | Operator-positioned; doesn't scale to fan-outs; reject | + +### 4.3 Recommendation: Buchheim–Walker + pinned main path + adaptive collapse + +> **Version scope.** **V1.0 ships plain `d3-hierarchy.tree()`** — layer 2 below (Buchheim–Walker over the whole tree). The Stack-collapse logic (layer 3) ships in V1.0 for Fan-Children Stack only. **Main-path pinning (layer 1) is V1.1**, when the `★` Pin affordance (§2.2 SendNode rail) is enabled. The three-layer design is preserved here for V1.1 implementers; V1.0 readers can mentally skip layer 1. + +Three layers, applied in order: + +1. **(V1.1) Identify the main path** (if any leaf is pinned). The main path is the unique root→pinned-leaf chain. Pin every main-path node's x-coordinate to a fixed centerline. +2. **(V1.0) Buchheim–Walker for the rest.** In V1.0, applied to the entire tree (no main path). In V1.1, applied to each off-main subtree with the main-path-side contour treated as a wall. +3. **(V1.0) Render-time stack collapse.** Nodes identified as Fan-Children Stack peers by the predicates in §3.1 are folded into a single Stack card. (Synced-Peers Stack collapse is V1.1 per §3.2.) + +The V1.0 layout call simplifies to: + +```ts +function layout(tree: ConversationTree): Map { + // V1.0: plain Buchheim–Walker on the whole tree + return buchheimWalker(tree.root, /* side */ 'center') +} +``` + +The full V1.1 algorithm: + +```ts +function layout(tree: ConversationTree): Map { + const positions = new Map() + const mainPath = computeMainPath(tree) // V1.1: root → pinned leaf, or empty + + // 1. 
(V1.1) Lay out main-path nodes on the centerline + let y = 0 + for (const node of mainPath) { + positions.set(node.id, { x: 0, y }) + y += VERTICAL_SPACING + } + + // 2. For every branching point on the main path, lay out the off-main subtree + for (const branchPoint of mainPath) { + for (const child of branchPoint.children) { + if (mainPath.includes(child)) continue + const subtreeRoot = child + const isLeftOfCenter = chooseSide(branchPoint) // alternates / packs tightly + const offset = buchheimWalker(subtreeRoot, isLeftOfCenter) + for (const [nodeId, point] of offset) { + positions.set(nodeId, point) + } + } + } + + // 3. (V1.0) If no main path is pinned, fall back to plain B–W on the whole tree + if (mainPath.length === 0) { + return buchheimWalker(tree.root, /* side */ 'center') + } + + return positions +} +``` + +**Why this beats the §8.2 naïve DFS:** the naïve approach reserves `Σwidth(children)` for every parent. Reingold–Tilford-style algorithms let small subtrees nestle into the gaps of large ones, often halving total width. For our use case where fan-outs frequently produce wide subtrees next to narrow chains, the tightness win is substantial. + +**Library choice:** + +- For the **layout primitive itself**, use `d3-hierarchy`'s `tree()` function — ~10 KB, well-tested, exactly the Reingold–Tilford-flavored "tidy tree" we need. We DO NOT pull in the rest of `d3` — `d3-hierarchy` is a standalone package. +- For the **main-path constraint and stack-collapse logic**, write our own ~80 LOC on top of `d3-hierarchy` output. + +This is a small upgrade from the §8.2 recommendation (which was "custom DFS, deterministic, ~50 LOC, dagre as fallback"). The honest reason to upgrade: the user has now explicitly raised the question of how to avoid horizontal sprawl, and B–W is the textbook answer to exactly that. §8.2 of `01_tree_primitives.md` should be updated to reflect this. 
+ +### 4.4 Edge routing + +Three options, with a clear winner: + +| Style | When it's good | When it's bad | +|---|---|---| +| **Straight lines** | Few nodes, short distances | Crosses other nodes in dense trees | +| **Bezier curves** (react-flow default) | Looks nice; few crossings | Hard to follow at scale; ambiguous origin handle | +| **Orthogonal / "Manhattan"** | Mirrors org-chart conventions; obvious parent-child relationships; no crossings if layout is right | Stiff-looking; needs corner-routing logic | + +**Recommendation: Orthogonal.** Tree layouts look like org charts; org charts use orthogonal routing for a reason — operators read them top-down and following a right-angle path is unambiguous. React-flow exposes `type: 'smoothstep'` which gives rounded orthogonal corners and is the standard choice for tree-like diagrams. + +### 4.5 Animation policy on layout shifts + +When a node is added/removed/moved, the rest of the tree may shift. We don't want a 200 ms "everything jumps" effect. + +Policy: + +- **Position changes < 4 px**: instant, no animation (avoids "twitch"). +- **Position changes 4–100 px**: animate with a 200 ms `ease-out`. +- **Position changes > 100 px** (operator added a big subtree off-screen): pan the viewport to *follow* the affected subtree's centroid instead of animating the layout shift in place. Operator focus stays anchored. +- **Stack-collapse / expand transitions**: 250 ms, scale + opacity. The stack card "expands into" the per-child cards. + +Use `framer-motion`'s `layout` animations if we want to take advantage of FLIP transitions; otherwise raw CSS transitions are fine and lighter (~0 bundle cost vs. ~50 KB). + +### 4.6 Stack collapse policy at different zoom levels + +Adaptive: as the operator zooms out, Stacks aggregate more aggressively. 
+ +| Zoom | Stack rendering | +|---|---| +| ≥ 100% | Stack shows: card + multiplicity + 3 most-recent execution summaries | +| 50–100% | Stack shows: card + multiplicity + aggregate status (e.g., "9 ✓, 1 ⚠") | +| < 50% | Stack shows: dot + multiplicity badge | +| < 25% | Whole subtrees beyond depth 2 collapse into a single "+N subtree" indicator | + +Lazy expansion (operator click) overrides the zoom rule. + +--- + +## 5. Scenario Walkthroughs + +Eighteen scenarios. Each: **goal → action sequence → before/after sketch → verdict (✓ design handles / ⚠ gap / 🛠 needs work)**. + +State suffix legend: `✓` clean, `↻` stale, `●` running, `⚠` failed, `◯` draft, `🔒` operator-locked. + +### Scenario → version map + +The full design surface is documented below. The V1.0 release covers the scenarios that touch only V1.0-shipped primitives. + +| Scenario | Version | V1.0 fallback if V1.1 | +|---|---|---| +| 5.1 Greenfield: first send | V1.0 | — | +| 5.2 Continue the conversation | V1.0 | — | +| 5.3 Re-roll the last response | V1.0 | — | +| 5.4 "Try this prompt 10 times" (attempt fan) | V1.0 | — | +| 5.5 Pick one of 10 to continue | V1.0 | Per §3.3 V1.0 note: visual dim only, no draft-placeholder dance | +| 5.6 Fan-through (synced follow-up to all branches) | **V1.1** | Operator expands the Stack and types the follow-up under each child individually | +| 5.7 Try 3 different converters on the same prompt | V1.0 | — | +| 5.8 Sweep across 3 targets | **V1.1** | Operator manually clones the tree (via `📋` Clone tree, which now ships V1.0) per target, editing the target on each clone's root prompt | +| 5.9 Edit upstream: visual propagation | V1.0 | — | +| 5.10 Refresh subtree | V1.0 | — | +| 5.11 Branch from a node | **V1.0** | Ships via the always-new-tree variant of `branchFromNode` (Patch #1, revision 9). V1.0 lands by swapping the active tree; V1.1 lands as a new tab in the strip. 
| +| 5.12 Open a historical attack (auto-reverse) | V1.0 (linear+converter) | The V1.1 fanout-detection mapping is the only gap; V1.0 shows the linear chain with converter pipelines, no implicit FanNodes | +| 5.13 Operator-locked branch | V1.0 | — | +| 5.14 Partial failure mid-refresh | V1.0 | — | +| 5.15 Drill into linear view | V1.0 | — | +| 5.16 Delete a branch | V1.0 | — | +| 5.17 Edit an early node in a large tree | V1.0 | — | +| 5.18 Browse refresh waves across the whole workspace | **V1.0** (depends only on `wave_id` labels which ship V1.0; the V1.x History-tab "Group by wave" toggle is the implementation surface) | — | + +### 5.1 Greenfield: first send + +**Goal:** Operator wants to send a single prompt. + +**Actions:** +1. Click `+ New tree` in the empty canvas. +2. RootPromptNode appears, focused. Operator types text + picks target. +3. Operator clicks `Send` button on the RootPromptNode card (or presses Enter). +4. A `SendNode` is auto-inserted as the RootPrompt's child; runner fires; node transitions `draft → running → clean`. + +``` +Before: After click: After send: +(empty canvas) [RootPrompt: "Hi"]◯ [RootPrompt: "Hi"]✓ + │ + ▼ + [Send → "Hi there!"]✓ +``` + +**Verdict:** ✓ Handled. + +### 5.2 Continue the conversation + +**Goal:** Operator wants to add a follow-up user message after seeing the response. + +**Actions:** +1. Hover the edge below the `Send` node. `+` chip appears. +2. Click `+`. Popover shows "Follow-up user message" as the first option. Click it. +3. New `UserTurnNode` appears below `Send`, focused, empty. +4. Operator types text, presses Enter. +5. A new `SendNode` auto-inserts under the new `UserTurnNode`. Runner fires. + +``` +[RootPrompt: "Hi"]✓ [RootPrompt: "Hi"]✓ + │ │ + ▼ ▼ +[Send → "Hi there!"]✓ → [Send → "Hi there!"]✓ + │ │ + + ← hover ▼ + [UserTurn: "How are you?"]◯ + │ + ▼ + [Send]● +``` + +**Verdict:** ✓ Handled. Edge-affordance + auto-Send insertion makes this 2 clicks. 
+ +### 5.3 Re-roll the last response + +**Goal:** "I didn't like that answer, try again." + +**Actions:** Click `↻` on the `SendNode`. + +**UI shows:** Node briefly enters `●` state. Old `ExecutionRecord` moves into `executionHistory` (visible in the right-side drawer with a "Compare" toggle). New `ExecutionRecord` lands as `clean`. **Tree shape unchanged.** + +**Verdict:** ✓ Handled. + +### 5.4 "Try this prompt 10 times" (attempt fan from a fresh Send) + +**Goal:** Sweep N attempts on the same prompt. + +**Action A (operator knows up-front):** +1. After typing the prompt and before clicking Send, click `↻×N` on the RootPrompt's pending Send affordance. Picker appears: "How many attempts? [10]". +2. Click OK. A `FanNode(axis='attempt', n=10)` is created with 10 `SendNode` children, rendered as a Stack. + +**Action B (operator decides after first response):** +1. After seeing the response, click `↻×N` on the existing `SendNode`. Picker: "Total attempts including this one? [10]". +2. The existing `SendNode` is **wrapped**: a new `FanNode(axis='attempt')` is inserted as the SendNode's parent, the existing SendNode becomes variant #0, 9 new draft SendNodes are added as variants #1–9. + +``` +Before (Action B): After: +[Send → "X is ..."]✓ ┌─────────────────────────────┐ + │ Fan: axis=attempt, n=10 │ + │ ┌──────────────────────┐ │ + │ │ Send ×10 │ │ + │ │ (1 ✓, 9 ◯) ▶ refresh│ │ + │ └──────────────────────┘ │ + └─────────────────────────────┘ +``` + +**Verdict:** ✓ Handled. The promote-existing-Send-to-fan mechanic preserves the operator's first execution as variant #0 rather than re-running. + +### 5.5 Pick one of 10 to continue (the stacked-response operation) + +**Goal:** Operator ran 10 attempts; wants to continue the conversation from response #4. + +**Actions:** +1. Click `⊞` on the Stack card to expand. 10 per-child SendNode cards appear in a tight horizontal row. +2. Operator clicks each card to read responses (right-side drawer shows the assistant text). +3. 
Operator clicks `🎯 Pick one` on card #4. Confirmation: "Promote #4 and freeze the other 9?". +4. (Under the revised model, no field changes: `FanNode.params.promotedChildSlotIndex=4` is set. Cards #1-3, #5-10 dim to ~40% opacity; card #4 stays full opacity with a highlight border. No layout shift.) +5. Card #4 now has a normal `+` edge-affordance below it; operator inserts a follow-up. + +``` +After Pick: +[Fan: axis=attempt, n=10] + │ + ├──── [Stack: 9 frozen attempts] 🔒 (cannot be edited; preserved for history) + │ + └──── [Send #4 → "X is best understood as..."]✓ + │ + + ← operator continues from here +``` + +**Verdict:** ✓ Handled. This is the cleanest UX for the "stacked response with selectable propagation" the user described. + +### 5.6 Fan-through: follow-up that applies to all branches + +**Goal:** "I want to send these 10 attempts, then ask 'what assumptions are you making?' to ALL of them." + +**Actions:** +1. Operator has a Stack with 10 attempts in **Synced state** (`promotedChildSlotIndex = null`). +2. Operator clicks `+` at the bottom of the Stack card (the per-stack `+` affordance from §2.4). +3. Popover: "Add follow-up to all 10 branches". Operator picks "Follow-up user message". +4. A `UserTurn ×10 (synced)` card appears inside the Stack's bounding box, with one shared text editor. +5. Operator types "What assumptions are you making?" once. Each of the 10 underlying `UserTurnNode`s is created with `addedToStack=true` and identical `params.text`; the parent-walk peer rule (§3.2) groups them, and edits to the Stack card propagate to all 10. +6. A `Send ×10` card auto-inserts below. Operator clicks the Stack's `↻` ("Refresh children") to run. + +``` +[Fan: axis=attempt, n=10] (Synced — no promotion) + ┌────────────────────────────────────────┐ + │ [Send ×10] "10 ✓" │ + │ │ │ + │ ▼ │ + │ [UserTurn ×10 (synced)] │ + │ "What assumptions are you making?" 
│ + │ │ │ + │ ▼ │ + │ [Send ×10] "10 ✓" │ + └────────────────────────────────────────┘ +``` + +If the operator later **Picks** one (say #3), the visual changes but the structure does not: #3's path stays at full opacity, all other peers dim. New `+` clicks then add only under #3. + +If the operator wants to **diverge** branch #3 from the synced UserTurn text without picking ("on this one, ask something different"): + +7. Operator clicks `⊞` to expand the inner Stack, then clicks the per-child `+` (grey-on-card, distinguishable from the Stack's blue `+` per §2.4) on branch #3's UserTurn for a one-off edit — OR uses the "Unstack" affordance to disband the sync entirely. +8. Branch #3 becomes individually editable. Its `params.text` now differs from the other 9, so the §3.2 peer rule no longer groups it with them; the Stack visually decomposes at this layer. Branches 1, 2, 4-10 still match each other's `params` and remain rendered as a smaller Synced-Peers Stack with 9 peers. If the operator later restores #3's text to match the others, the Stack re-forms at full size (implicit re-stacking via params convergence). + +**Verdict:** ✓ Handled. The `+`-on-Stack vs. `+`-on-child distinction is the same color/style rule used in §2.4. + +### 5.7 Try 3 different converters on the same prompt + +**Goal:** Sweep ROT13 / Base64 / NoOp. + +**Actions:** +1. After typing the prompt (or selecting an existing UserTurnNode), click the `🔀` (wrap-in-fan) affordance on the node's rail. +2. Picker: "Fan axis: [prompt / converter / target / system_prompt / attempt]". Pick "converter". +3. Modal: "Variants" with an Add chip. Operator adds ROT13, Base64, NoOp. +4. Tree shape changes: UserTurnNode is wrapped in a `FanNode(axis='converter')` with 3 child UserTurnNodes, each carrying one converter in its pipeline. SendNodes under each. + +**Verdict:** ✓ Handled. + +### 5.8 Sweep across 3 targets + +**Goal:** Same prompt, three models. + +**Actions:** Same as §5.7 with axis = `target`. 
Each child is a SendNode (no UserTurn variant needed; the prompt is identical). + +``` +[RootPrompt: "Explain photosynthesis"]✓ + │ + ▼ +[Fan: axis=target, variants=[gpt-4o, claude-3.5, llama-3]] + │ + ▼ (3 branches) + [Send→gpt-4o]✓ [Send→claude-3.5]✓ [Send→llama-3]✓ + AR_1 AR_2 AR_3 +``` + +Per §7.2 of primitives, 3 ARs because target changes. Per §9.2 of primitives, this is no longer a special case under AR-per-leaf. + +**Verdict:** ✓ Handled. The Fan card displays "spawns 3 AttackResults" hint. + +### 5.9 Edit upstream: visual propagation + +**Goal:** Operator changes the root prompt and wants to see what becomes stale. + +**Actions:** +1. Operator clicks the root `RootPromptNode`'s `✏` button, edits text, blurs. +2. Root state: `clean → edited`. +3. **All descendants** transition `clean → stale`. Visually: their cards get a yellow border + a small `↻` overlay icon. Edge animation: a faint pulse travels down each edge for 400 ms to draw the eye. +4. The canvas-level ribbon shows "1 edited, 14 stale" with a `Refresh tree` button. + +**Verdict:** ✓ Handled. The visual pulse is a "show, don't tell" cue that propagation happened. + +### 5.10 Refresh subtree + +**Goal:** Operator only wants to re-run one branch, not the whole tree. In git terms: rebase a subtree onto its updated upstream. + +**Actions:** +1. Right-click on the branch's root node → context menu → "Refresh subtree" (or shift-click the node's `↻`). +2. Runner walks down with `maxParallel=4` (per-Workspace; §12.2 of primitives). Each affected node animates `stale/edited → running → clean/failed`. +3. Previous executions per node move into reflog (§6.6 of primitives), evicting oldest if over the configurable cap (default `REFLOG_CAP_PER_NODE = 50`); eviction surfaces a ribbon marker per §2.3. + +**Verdict:** ✓ Handled. + +### 5.11 Branch from a node - the "this prompt didn't work, let me try another angle" motion + +**Goal:** Operator is mid-conversation. 
The most recent prompt didn't land well — they want to **edit that prompt and re-run** to see a different outcome, while **preserving the original run** so they can compare or come back. + +**Actions (V1.0 — minimal Workspace swap variant):** +1. Operator clicks the `📋` icon on the UserTurn whose text they want to rewrite. Tooltip reads "Branch from here" (because the node is not the root). +2. **The canvas swaps to a new ConversationTree** (V1.0; V1.1 opens a new tab — see §13.1 vs §13.3 of primitives). The source tree's id is pushed onto `recentTreeIds` and a toast appears: *"Branched from `<source-tree-name>`. Source tree saved to History (use Switch tree or History → Open as tree to return)."* +3. The new tree contains a deep copy of the root-to-this-node path **plus this node's descendants**. Siblings of any node on the path are not carried over. All cloned nodes initially share `ExecutionRecord` refs with the source — no token cost, no backend calls. +4. The cloned UserTurn is focused with its text editor open. Operator edits the text and presses Enter. The edited node goes `edited`; its descendants go `stale`. Runner kicks off a wave on the cloned subtree under the new tree's fresh `conversation_tree_id`. The original tree is **never touched** (its backend ARs are untouched; only this canvas swapped away from it). +5. Operator can return to the source via: + - **Switch tree** button in the canvas-level ribbon (§2.3) — picks from `recentTreeIds`. + - **History tab → Open as tree** (the §9.4.1 reload-reconstruction path; restores the source with all completed leaves). + - **Second browser tab** for true side-by-side comparison (the §9.4.3 `BroadcastChannel` advisory lock keeps the two tabs from racing the runner). + +**Actions (V1.1 — full tab strip):** identical except step 2 opens a new tab in the strip instead of swapping; the operator flips between source and clone via tabs without going through "Switch tree" or History.
+ +``` +Original tree: New tree (after edit + refresh): +R --- A R' --- X' (edited) + \- X --- B \- B' (refreshed, new AR) + \- C \- C' (refreshed, new AR) +``` + +**The whole-tree case ("I want both attempt #4 AND attempt #7"):** click `📋` on the root node. Tooltip reads "Clone tree" instead of "Branch from here" because the source slice is the entire tree. Mechanically identical — it's `branchToNewTree(root)`. V1.0: clone swaps the canvas, operator flips via Switch tree / second browser tab; V1.1: both trees show in the tab strip, the operator sets a different `promotedChildSlotIndex` in each. + +**Verdict:** ✓ Handled. One affordance (`📋`), one primitive (`branchToNewTree`), two contextual labels. The user's "edit this prompt and propagate to see the outcome — but the old one stays immutable" motion is the design intent. V1.0 ships the data-model and primitive; V1.1 ships the tab-strip ergonomics. + +### 5.12 Open a historical attack (auto-reverse) + +**Goal:** Operator opens a 12-turn attack from the History tab. + +**Actions:** +1. From History tab, click "Open as tree" on an AttackResult row. The frontend calls [01 §13.1 `openTreeFromAttackResult(attackResultId)`](01_tree_primitives.md#131-v10-minimal-workspace). +2. Per §9.3 of primitives, the runner walks the conversation's messages and synthesizes tree nodes: + - 12 `UserTurn`+`Send` pairs in a linear chain (V1.0). + - **(V1.1)** If multiple leaf ARs share a `conversation_tree_id` and converge at a common lineage root via `original_prompt_id` (per §9.3.1 of primitives — the O(1) hash-bucket group-by; `wave_id` disambiguates fan members vs. separate explorations), an implicit `FanNode(axis='prompt')` is inserted at the divergence point. +3. Tree renders. Synthesized nodes get a "reconstructed" badge (V1.0); reconstructed fans additionally get a "reconstructed from history" badge (V1.1).
+ +**`conversation_tree_id` id-minting (V1.0).** `openTreeFromAttackResult` inspects the source AR's `labels.conversation_tree_id`: +- **V1.0+ AR** (label present): delegates to `openTree(treeId)`; URL fragment reflects the existing id; reload-reconstruction follows the standard §9.4.1 path. +- **Pre-V1.0 AR** (label absent): frontend mints a fresh `ConversationTreeId` via `crypto.randomUUID()` and stores `ConversationTree.parentSourceConversationId = ar.conversation_id` (also mirrored to sessionStorage at `pyrit.workspace.parentSourceConversationId.<treeId>`). URL fragment immediately reflects the new tree id. **Until the first Refresh fires, no backend write has happened** — the minted id is operator-local. Reload of an unrefreshed minted tree uses the §9.4.1 pre-V1.0 fallback path: labels-query returns no rows, sessionStorage lookup returns the legacy `conversation_id`, hydration falls through to `GET /api/attacks?conversation_id=Y`. The first Refresh fires `create_attack + N add_message` with the minted id in `labels.conversation_tree_id`; the resulting per-leaf AR rows in History are the first persisted references to the new tree, and the legacy AR keeps its own `conversation_id` (no label rewrite — see [03 §12 Q.H.1](03_runner.md#12-open-questions) for the label-inheritance choice). + +``` +After auto-reverse of a 12-turn linear AR: +[ImportMessage: AR_xxx]✓ + │ + ▼ +[UserTurn #1]✓ (reconstructed) + │ + ▼ +[Send #1]✓ → AR_xxx (this AR) + │ + ▼ +... 11 more pairs ... +``` + +The operator can now edit any node and refresh — re-execution spawns new ARs under a fresh `conversation_tree_id`. + +**Verdict:** ✓ Handled. The "reconstructed" badges set expectations that the conversation tree structure is inferred, not authored. + +### 5.13 Operator-locked branch + +**Goal:** Operator opens a colleague's attack. + +**Actions:** +1. Open in tree view (5.12). +2. Per §9.1 of primitives, every reconstructed node from someone else's AR renders with a 🔒 badge. +3.
All mutating affordances (`✏`, `↻`, `+`, `🗑`, `🔀`) are disabled and grey, with tooltips: "Owned by alice — snapshot to continue". +4. Only `📋 Snapshot` and `🔍 Open in linear view` are enabled. + +**Verdict:** ✓ Handled — but only the visual lock; per §9.1 the runner must also catch the backend 400 if the operator somehow bypasses the visual guard (e.g., via keyboard shortcut). + +### 5.14 Partial failure mid-refresh + +**Goal:** Operator clicks "Refresh tree", 3 of 15 leaves fail (rate limit / target down). + +**Actions:** +1. Subtree refresh starts. Nodes go `●` in waves. +2. As completions come back: 12 transition to `✓`, 3 transition to `⚠ failed`. The [03 §5.3](03_runner.md#53-cascade-on-failure) in-flight cascade drops any sibling leaves sharing a failed ancestor from `ready` and marks them `⦾ blocked` (distinct from `⚠ failed` — a blocked leaf never dispatched). +3. The 12 are `clean`; the 3's descendants (if any) remain `stale` because they have no input. +4. Top-of-canvas toast: "Refresh complete: 12 succeeded, 3 failed, 0 rate-limited, 0 blocked, 0 needs-fix, 0 cancelled. [Retry failed]". The five non-success buckets are spelled out in §2.3 above (rev 18, per rubber-duck Finding B.3 — `needs-fix` surfaces `failure_class='permanent'` distinctly so operators understand which leaves [Retry failed] excludes by design). The [Retry failed] button captures wave-W's failed-leaf ids + blocked-leaf ids at this completion event and calls [`runner.retryFailedNodes(treeId, nodeIds)`](../../../doc/gui/design/03_runner.md#21-entry-points-the-public-api) on click — scoped to wave-W's victims, not the whole tree. Rate-limited and needs-fix leaves are excluded from `nodeIds` (operator must wait + click Refresh tree manually for rate-limited; must edit the underlying request for needs-fix). When *all* failures are rate-limited, [Retry failed] is disabled with tooltip *"N leaves were rate-limited. 
Wait for the target's rate-limit window to clear, then click Refresh tree to retry."* +5. Failed nodes show a small `⚠` chip with hover-tooltip showing the error message. + +**Verdict:** ✓ Handled per §6.4 of primitives. + +### 5.15 Drill into linear view + +**Goal:** Operator wants to read a full conversation in the familiar chat UI for one leaf. + +**Actions:** +1. Click `🔍` on a leaf SendNode (or just click the node and use the keyboard shortcut `L`). +2. Right pane slides in showing the existing `MessageList` + `ChatInputArea` ([ChatWindow.tsx](../../../frontend/src/components/Chat/ChatWindow.tsx)) loaded with the leaf's `AttackResult` and conversation. +3. The tree view in the left pane stays interactive — the operator can switch to other leaves and the right pane follows. +4. Sending a message in the linear view's input box: under the hood, this is a new `UserTurnNode + SendNode` child appended to the leaf in the tree. The tree updates immediately. + +**Verdict:** ✓ Handled. The "follow-up animation" between graph and linear views from §10.2 of primitives is the polish item. + +### 5.16 Delete a branch + +**Goal:** "I don't need this experimental branch anymore." + +**Actions:** +1. Operator clicks `🗑` on the subtree's root. +2. Confirmation: "Delete 7 tree nodes? Their 4 AttackResults will remain in History (filter by conversation_tree_id to find them)." +3. Operator confirms. The subtree disappears from the canvas. +4. Backend state untouched (append-only). + +**Verdict:** ✓ Handled. The confirmation language tells the operator exactly what is and isn't deleted. + +### 5.17 Edit an early node in a large tree — see what the refresh produced + +**Goal:** Operator has a 60-leaf tree (per Appendix A in primitives). They edit the root prompt and want to understand the resulting refresh wave digestibly. This is the §10 walkthrough in scenario form. + +**Actions:** +1. Operator clicks the root `RootPromptNode`'s `✏` button, edits text, blurs. 
Root → `edited`; 60 descendants → `stale`. Yellow borders propagate. Canvas-top ribbon reads "1 edited, 60 stale". +2. Operator clicks the ribbon's "Refresh tree" button. +3. **Preview banner** has already shown: *"Refresh 60 leaves? Estimated 60 target calls. [Refresh] [Cancel]"*. Since 60 > the default `confirmThresholdCount = 20`, a **confirmation modal** intercepts the click before any backend call goes out (§8.1). Operator confirms. +4. Runner stamps a fresh `waveId = abc123` and walks the tree with `maxParallel=4` (per-Workspace; §12.2 of primitives). Affected nodes pulse `stale → running → clean`. (Failed nodes pulse `running → failed`.) +5. **Wave completion toast** lands at the bottom-right: "*Wave complete: 57 ✓, 3 ⚠. [View wave]*". +6. Operator clicks "View wave". The right-side drawer opens to the "Recent waves" tab with `abc123` selected; the canvas dims everything except the nodes touched by this wave; the drawer shows: + - Trigger: `RootPromptNode` (with "Jump to node" link) + - 60 leaves affected: 57 succeeded, 3 failed, 0 cancelled + - Per-leaf list with status + 80-char output preview + - "Compare to previous wave" button (V2; greyed in V1) + +**Verdict:** ✓ Handled in V1 by the toast + drawer panel. Tree-local diff view is V2. + +### 5.18 Browse refresh waves across the whole workspace + +**Goal:** Operator has three worktrees open and wants to see what's been happening across all of them in the last hour. This is the cross-tree wave story. + +**Actions:** +1. Operator switches to the existing **History** tab (sidebar, alongside `'tree'`, `'chat'`, `'config'`). +2. The History tab's existing filter chips (operator, operation, attack type, outcome) gain a new chip: **"Group by wave"** (toggle). +3. Operator toggles it on. AR rows collapse into wave-group rows. Each wave-group row shows: `wave_id` short suffix · timestamp · trigger ConversationTree/node · "60 ARs (57 ✓, 3 ⚠)" · expand chevron. +4.
Operator expands the most recent wave. The 60 ARs are listed underneath, each clickable for its individual conversation. +5. Operator clicks "Open in tree". The originating ConversationTree opens (or focuses, if already open) in the `'tree'` tab with the wave-filter pre-applied (matches scenario §5.17 step 6 from the History side). + +**Verdict:** ✓ Handled in V1.x once the History tab gains the `wave_id` group toggle (~30 LOC). The History tab already accepts the `?label=wave_id:X` filter via its existing labels filter ([HistoryFilters.tsx](../../../frontend/src/components/History/historyFilters.ts) — exact reference resolved at implementation). + +--- + +## 6. Affordances → Primitives Delta + +Two small additions to `01_tree_primitives.md` are needed to make the Stack and the Promoted state work cleanly. Everything else in this doc is pure UX over the existing primitives. + +### 6.1 `addedToStack` on `ConversationTreeNodeBase` (V1.1) + +> **Version scope: V1.1 only.** Revision 8 reserved `addedToStack` on the V1.0 type "so V1.1 doesn't need a schema migration." **Revision 9 drops the V1.0 reservation** — the field has zero V1.0 readers or writers, so its presence on the V1.0 type is dead code and a "what is this?" tax on every V1.0 reader. +> +> **V1.0 → V1.1 migration: TypeScript-structural extension with explicit `false` default at the read site.** The V1.1 PR adds `addedToStack: boolean` to `ConversationTreeNodeBase`. The V1.1 reader code paths (Synced-Peers Stack detection in §3.2, Stack-`+` gating in §3.4a, the §6.1 peer-detection rule) read `node.addedToStack ?? false` rather than `node.addedToStack` directly — TypeScript treats absent fields as `undefined` at the type level (since the field is required after the V1.1 schema change, but V1.0-created nodes loaded from sessionStorage won't have it). The `?? false` is the entire migration cost: no schema-rewrite script, no version field, no migration timestamp. 
V1.0 nodes correctly read as "not operator-stacked" (which is true — V1.0 had no Stack-`+` to set them). +> +> The V1.0 PR set does NOT include this field; the V1.1 PR set adds it as a non-breaking type extension. + +The V1.1 type: + +```ts +export interface ConversationTreeNodeBase { + // ... existing fields ... + + /** + * True iff this node was created as part of a Stack-`+` operation that added + * N>=2 synchronized peers at once (the §5.6 fan-through case). Default + * false. Set at creation; never auto-flipped. Carried across `branchFromNode` + * clones via deep-copy. + * + * Stack peer-detection is DERIVED (no stored grouping UUID). See §3.2: + * two nodes are Synced-Peers Stack peers iff + * (a) both have addedToStack=true, + * (b) walking up their parent chains they reach the same nearest FanNode + * ancestor at the same depth below it, + * (c) their params are deeply equal (divergence is implicit, no flag). + * + * Stack-`+` on a Fan card is gated (§3.4a): once any synced-peer layer + * exists under a Fan, the Fan's Stack-`+` disables and the chain extends + * via the new Synced-Peers Stack's own Stack-`+`. This guarantees one + * synced-peer set per fan layer. + * + * In Promoted state (per §3.3), Stack-`+` adds N symmetric peers (not a + * singleton): the promoted peer's child is non-draft, the N-1 others are + * draft placeholders. Unpick activates the placeholders so the Stack + * becomes a real Synced-Peers Stack across all N. + * + * Fan-axis children NEVER get addedToStack=true. They are visually grouped + * by the separate Fan-Children Stack render rule (§3.1). + */ + addedToStack: boolean +} +``` + +**Why it must live in the conversation tree model and not just in render state:** + +- It persists across edits and reloads (V2): the field records *how the node was created*, which is durable provenance. +- The runner reads it when servicing `refreshSubtree` to optionally bundle synced peers into one wave. 
+- `branchFromNode` deep-copies it; clones preserve which nodes were operator-stacked and which were fan-children. + +**Why we dropped `syncGroupId`** (the revision 6 design): the only source of "synced peers" is operator-driven Stack-`+`; everything else is structural. A stored grouping UUID added a field operators never see, required cloning gymnastics, and obscured the fact that divergence is just "params differ" — derivable, not stored. + +### 6.2 `promotedChildSlotIndex` on `FanNode.params` + +```ts +export interface FanNode extends ConversationTreeNodeBase { + kind: 'fan' + params: { + // ... existing fields (axis, variants, mode) ... + + /** + * Optional: the slotIndex of one child to mark as "promoted" (the git + * cherry-pick analogue, §3.5). UI renders the promoted child at full + * opacity + highlight border; other children dim to ~40% opacity + * ("frozen" — not deleted, not editable, no new synced children). + * Set by the "Pick" affordance; cleared by "Unpick". Promotion is per- + * FanNode and does not cascade through nested fans (each FanNode owns + * its own promotion state). Null = all children synced (default). + */ + promotedChildSlotIndex: number | null + + /** + * Tombstone list — slotIndices that have been deleted. Per [01 §5.1 + * invariant 2](01_tree_primitives.md#51-invariants), deleted children's + * indices do not get reused. Makes the invariant runtime-checkable. + */ + deletedSlotIndices: number[] + } +} +``` + +**Promotion is purely a UI/editing concern.** The runner ignores `promotedChildSlotIndex` and always refreshes every stale descendant. Operators who want "only refresh the promoted path" use a per-call option (`refreshSubtree(id, { promotedOnly: true })`), not this field. + +### 6.3 Suggested update to §8.2 of primitives + +Already applied in revision 4: §8.2 now recommends **Buchheim-Walker via `d3-hierarchy.tree()`** + main-path pinning + adaptive stack collapse. Bundle delta: +10 KB for `d3-hierarchy`. 
Code delta: ~80 LOC for main-path pinning, replacing the ~50 LOC of naïve DFS. + +### 6.4 Suggested update to §6.5 of primitives (Branch from node) + +Applied in revision 7: §6.5 of primitives defined a single primitive `branchFromNode(nodeId)`. **Revision 14 split it into two explicit functions** — `branchToNewTree(nodeId)` (V1.0/V1.1 always-new-tree variant) and `branchToSubtree(nodeId)` (V1.1 sibling-subtree variant) — forcing call sites to be explicit about landing mode. The split is per reviewer guidance: the two operations differ in return type, version-scope, and downstream invariants; a single-function-with-flag would hide silent call-site bugs. UI labels still disambiguate: "Clone tree" on root, "Branch from here" otherwise (both invoke `branchToNewTree`); the V1.1 `🌿` icon invokes `branchToSubtree`. V1.0 ships the V1.0 surface; V1.1 adds `branchToSubtree` non-breakingly. + +### 6.5 Git mental model + +The git-vocabulary table lives in [01_tree_primitives.md §6.8](01_tree_primitives.md#68-git-mental-model-for-operator-vocabulary). Primary UI button labels in this doc use the friendly verbs that match the API surface (`Refresh node` / `Refresh subtree` / `Refresh tree`). Git terminology surfaces only for execution-history concepts that have no equally-concise English equivalent: `Reflog` (`Past runs` tab), `Cherry-pick` (Stack picks), `Clone Tree`, `Checkout this run`, `Make current`. The data model keeps its existing names (`conversation_tree_id`, `ExecutionRecord`, `executionHistory`, `refreshSubtree`). + +--- + +## 7. Decisions and Open Questions + +### Version-scope summary (this round) + +The revision-7 decisions below are unchanged; revision 8 layers V1.0/V1.1 scope on top per the [01 §1 V1.0 exclusions](01_tree_primitives.md#v10-explicit-exclusions-deferred-to-v11). The decisions are about *whether* and *how*; the version markers are about *when*. 
None of the V1.1 exclusions changes any decision below — V1.1 ships them as the decisions specify, just later than V1.0. + +### Resolved (this round) + +**A.1 — Snapshot `conversation_tree_id` policy → Fresh `conversation_tree_id` with `parent_conversation_tree_id` back-link.** When the operator clones a tree (snapshot-at-root) or snapshots a subtree, the new conversation tree nodes are tagged with a fresh `conversation_tree_id` and an additional `parent_conversation_tree_id` label pointing at the source. Consequences: + +- History filter by `conversation_tree_id` shows only ARs born under that tree (cleanly separated views per workspace). +- History filter by `parent_conversation_tree_id = T` shows all clones derived from `T` (the "where did I fork this from" navigation). +- Two clones can be browsed side-by-side without contaminating either's history view. +- The git framing in §3.5 is faithful: each tree is its own branch with its own ref history; the parent pointer is the equivalent of the `branch.<name>.merge` configuration. + +This replaces revision 3's "same conversation_tree_id" idea (which would have made the History tab confusing as soon as the operator started cloning). + +**A.2 — "Pick one" cost → Orphan from conversation tree only; no new labels.** Picking a Stack member does not introduce any backend-visible distinction between the picked and frozen members — they all stay queryable in History under the same `conversation_tree_id`. The operator's UI surfaces the choice (highlight + dim), and that's the entire story. **Pursuing multiple "picked" responses in parallel uses `branchToNewTree(treeRoot)` (§5.11), not a multi-promoted primitive.** Promotion stays single-valued per FanNode; branching is the answer when the operator wants "but I also want to see what attempt #7 leads to".
+ +This honors the user's "just modifying the linking, not copying the commits" intuition: a cloned tree initially references all the same `ExecutionRecord`s as the original — the divergence happens at edit/re-run time, not at clone time. + +**A.3 — Onboarding overlay → Not pursued.** Per the user: no. The `+` chip behavior is discoverable through hover and is consistent with whiteboard/canvas tools the target operator population already uses (Miro, FigJam, Linear's workflows). Skip the overlay. + +**A.4 - Stack `+` vs. per-child `+` ambiguity → Promotion state + one-per-fan-layer gating disambiguates.** When the Stack is in **Synced** state, the Stack `+` (filled blue, at the Stack's bottom edge) is the only `+` visible and unambiguously means "add a synced peer set at this depth". When a member is **Promoted**, the Stack `+` stays put and now adds N symmetric peers but only the promoted one is non-draft (§3.3). Per-child `+` chips on expanded Stack rendering remain grey-on-card to distinguish from the blue Stack `+`. Per Q.3.4 (revision 7), the **fan's** Stack-`+` disables once a synced-peer layer exists under it (§3.4a) - the chain extends downward from the new Synced-Peers Stack's own `+`, not from the fan. This collapses the previous three-affordance model into one Stack `+` whose meaning is read from the visual context (which member is highlighted) and whose presence is gated to one per fan layer, eliminating the "two batches merge" surprise. + +**A.5 — Mobile / narrow viewport → Out of scope for V1; long-term whiteboard vision noted in §9.** Per the user: do not worry about this now. The aspirational direction is a navigable canvas (Miro-style pan/zoom, free node positioning, multi-tree workspace). React-flow already supports the canvas mechanics; the whiteboard polish is a follow-up doc. 
+ +### Resolved this round + +**A.6 — Worktree data model.** Adopted formally in [01_tree_primitives.md §13](01_tree_primitives.md#13-workspace-and-worktrees---the-data-model). Workspace = `{ conversationTrees: ConversationTree[]; activeConversationTreeId }`; tab strip in the 'tree' view; `branchToNewTree` (primitives §6.5; named `branchFromNode` before the revision-14 split described in §6.4) creates a new ConversationTree tab. Rejected: per-node `frozen` flag (branching is the answer), full conversation tree version log (V2+). + +**Q.7.B — Reflog browsing → in-place ⟲ badge + drawer tab (both).** Per the user's revision-5 input: surface the reflog as a visible icon on the node *and* in the drawer. Spec: +- **On the node card:** a small `⟲ N` badge appears in the node's footer when `executionHistory.length > 0`. Clicking opens an in-place popover listing past runs (timestamp + truncated output preview). Clicking a past-run row in the popover enters detached state (see Q.7.C). +- **In the drawer:** the right-side drawer (§2.3) gains a "Past runs" tab next to "Current" and "Compare". Same content as the popover but with full output rendering, scoring details, and an explicit "Make current" affordance per row. +- The in-place badge keeps the reflog discoverable without forcing a drawer open. The drawer is for deeper inspection and the "Make current" destructive op. + +**Q.7.C — Detached HEAD safety → (a) silently re-tip with a toast.** Per the user's `⟲` suggestion, the visual entry point is the same icon used for Q.7.B. Spec: +1. Operator clicks the `⟲ N` badge → popover lists past runs (newest first). +2. Operator clicks a past run → node enters **detached** rendering: dotted border, small "Detached" pill, a "Make current" button visible in the drawer's reflog tab. +3. While detached, the displayed `execution` is the past run (read-only inspection). The node's actual `execution` field is unchanged. +4.
If operator clicks `↻` (Refresh) while detached: + - Default: silently creates a new tip (new `ExecutionRecord` from the current resolved input), exits detached state, surfaces a toast "*Created new run #N. The detached past run is still in this node's reflog.*" + - Operator's prior detached selection is preserved in the reflog (it never left). + - This is git's `checkout -b new && commit` semantics, packaged as one click, with the safety net that nothing becomes unreachable. +5. To make the detached selection the current execution destructively, operator clicks "Make current" (the `git reset --hard` analogue). Confirmation modal: "*This will replace the current run. The previous run will move into the reflog.*" + +The toast on auto-re-tip is the key affordance — it makes the safety semantics visible without modal interruption. Operators learn the model from the toast text after one or two encounters. + +### Remaining open questions + +**Q.7.A — "Rebase" / "Refresh" terminology — DECIDED V1.0: friendly-first.** Primary UI button labels read `Refresh node` / `Refresh subtree` / `Refresh tree`, matching the API surface (`refreshNode` / `refreshSubtree` / `refreshTree`). Git terminology survives for execution-history concepts with no equally-concise English equivalent: `Reflog` (`Past runs` tab title), `Cherry-pick` (Stack picks), `Detached HEAD` (past-run inspection state), `Make current` (promotion from reflog), `Clone tree` / `Branch from here` (branching). The *rebase concept* remains the mental model explained in [01 §6.8](01_tree_primitives.md#68-git-mental-model-for-operator-vocabulary) — what Refresh does to stale descendants — but is not a button label. + +**V1.x follow-up (deferred):** the originally-brainstormed right-click "Rebase subtree" alias on the per-node context menu is deferred. 
Operators who want the git surface get it through the conceptual section, the reflog/cherry-pick tab titles, and tooltip text on the Refresh buttons that names the git equivalent. The choice is reversible: a single `terminology.ts` module mapping operation IDs to (primary label, alias label, tooltip text) tuples can A/B-test git-first labels post-launch if operator feedback warrants. Originally V1 PR scope per the brainstorm below; reduced to V1.x to keep V1.0's primary-label surface uniform. + +**Brainstorm (preserved for historical context; verdict in bold):** + +| Operation | **Friendly-first (DECIDED V1.0)** | "Git first" (rejected) | Mixed (rejected) | +|---|---|---|---| +| `refreshSubtree` (button label) | **`Refresh subtree`** | `Rebase subtree` | Default to context: "Refresh" on a fresh subtree, "Rebase" when descendants are stale | +| `refreshSubtree` (right-click alias) | (V1.x: optional `Rebase subtree` alias) | — | — | +| `executionHistory` browsing | **`Past runs (N)`** | `Reflog (N)` | `Past runs (Reflog)` — both terms in the tab title | +| Stack `Pick` (button) | **`Pick this run`** (V1.x: alias `Cherry-pick`) | `Cherry-pick this run` | `Pick (cherry-pick)` | +| Detached state | **`Viewing past run`** | `Detached HEAD` | `Viewing past run (detached)` | +| `branchToNewTree(root)` | **`Clone tree`** | `git checkout -b` / `git worktree add` | Always opens a new tree | +| `branchToNewTree(non-root)` | **`Branch from here`** | `git branch <name>` | Always opens a new tree | + +*Author lean: **friendly-first labels in the primary UI; git verbs surface in three places only** — (1) right-click aliases on the same action (V1.x), (2) the tab title for past runs ("Past runs (Reflog)" so the term is teachable), (3) tooltips on the friendly buttons that name the git equivalent for users who already know the model.* This gives discoverability without overwhelming operators who don't think in git.
The choice is reversible: a single i18n table flip switches between modes, so we can A/B test post-launch. + +**Followup PR scope** when the V1.x right-click aliases get picked up: a small `terminology.ts` module mapping operation IDs to (primary label, alias label, tooltip text) tuples. Every UI surface reads from it. Switching modes globally then becomes one line. + +**Q.7.D — "Discard from history" affordance (V1.x roadmap).** Exploration-heavy workflows produce a lot of history rows: a 200-leaf tree where the operator finds 5 interesting and discards 195 leaves still leaves 195 ARs in History with no operator-facing way to mark them as exploration noise. The §15.1 audit posture requires we **keep** the backend rows (never hard-delete), but a soft "Discard from History default view" affordance would let operators clean up the History tab's default scrollback. + +*Lean (V1.x):* add a `labels.discarded_from_history: "true"` AR label, settable from the tree-view's `🗑 Delete` confirmation modal ("Also hide N AttackResults from default History view? They remain queryable via Show discarded toggle."). The History tab's default filter excludes `discarded_from_history=true`; a "Show discarded" toggle lifts the filter. No backend changes; one extra label. + +*Why V1.x and not V1.0:* not blocking V1.0 release (operators can ignore discarded rows for the first month), and the affordance design wants to be informed by real History-tab usage patterns after the tree-UI ships. + +--- + +## 8. Reviewing Refresh Waves + +When the operator refreshes a 60-leaf tree, they get 60 new ExecutionRecords across many leaves. Without grouping these become an unsorted soup of UUIDs. This section is the UX side of [01_tree_primitives.md §14 (Refresh Waves)](01_tree_primitives.md#14-refresh-waves---grouping-per-node-executions-into-a-user-intent-unit), which adds the `waveId` to the data model. With one shared `waveId` per refresh call, three layered views become tractable. 
+ +### 8.1 The V1 chain: preview banner → confirm modal → toast → drawer panel + +Four lightweight UX surfaces, ordered by when the operator encounters them: + +**Before the refresh — preview banner.** The propagation pulse from §5.9 already makes "X nodes will be affected" visible. The canvas-top ribbon adds an explicit numeric line and a "Refresh tree" button. The preview reads: *"1 edited, 60 stale · estimated 60 target calls · [Refresh tree]"*. The estimate is the count of `Send` nodes in the edited+stale set times the max attempts each could trigger — accurate enough for a sanity check. + +**Before the refresh — confirmation modal (count-based threshold).** When the operator clicks `[Refresh tree]` and the estimated call count exceeds `confirmThresholdCount` (default **20**, configurable in workspace settings), a modal intercepts the click: + +``` +┌────────────────────────────────────────────┐ +│ Refresh 60 leaves? │ +│ │ +│ This will send 60 calls to gpt-4o │ +│ (threshold: 20 calls per refresh) │ +│ │ +│ [ ] Don't ask again this session │ +│ │ +│ [Cancel] [Refresh →] │ +└──────────────────────────────────────────────┘ +``` + +If the refresh spans multiple targets (cross-target `FanNode` per §9.2), the modal breaks down the count per target: *"40 calls to gpt-4o + 20 calls to claude-3.5-sonnet"*. The "Don't ask again this session" checkbox suppresses the modal until the operator reloads, subject to a 2× safety floor: the modal always fires when the estimated call count exceeds `2 × confirmThresholdCount`, even with the checkbox set. + +Waves below the threshold skip the modal entirely — small refreshes stay one-click. + +**During the refresh — in-canvas progress.** Per §5.14, affected nodes animate `stale → running → clean/failed`. The ribbon shows `[ ●●●●●●○○○○ ] 6/60 (3 ✓, 0 ⚠, 1 ●)` so the operator can see progress without watching every node. + +**After the refresh — wave completion toast.** Bottom-right toast: *"Wave complete: 57 ✓, 3 ⚠, 0 cancelled. [View wave] [Dismiss]"*.
The toast auto-dismisses after 8 seconds; the "View wave" link remains accessible via the Recent waves drawer tab (§8.2). + +This four-step chain is the minimum-viable answer to "what just happened." It costs ~200 LOC: the ribbon counter, the confirmation modal, the toast component, and the wave-state tracking. No new views. + +**Roadmap: cost-based threshold (V1.x).** V1 ships with a **count-based** threshold only. The same modal scaffold can later carry a per-target `estimatedCostPerCallUSD` field (operator-typed at target-create time) and a `confirmThresholdUSD` cap that triggers the modal independently of the call count. Surfaced as *"Estimated cost: ~$3.20 (cap: $1.00)"* in the modal body. Out of V1 scope to keep the first PR small; revisit when operators ask for it or after the first credit-card-blowing refresh reported in the wild. + +### 8.1a Detached HEAD on a `failed` node (V1.0) + +A `failed` node has `node.execution = null` per [01 §6.4.1](01_tree_primitives.md#641-why-nodeexecution--null-on-failure-not-preserved) but its `executionHistory` may still contain prior successful runs. The reflog badge (the `⟲ N` per-node footer badge from Q.7.B) still shows; clicking it lets the operator inspect those prior runs. The detached state on a failed node renders specially: + +- **Dotted border** (same as detached on a clean node) plus **a red error chip** showing `node.lastError` (per [03 §2.2 sink](03_runner.md#22-state-update-plumbing)). +- **The "Make current" button is enabled** even though current `execution` is null — the `makeCurrent` step-0 guard in [01 §6.7](01_tree_primitives.md#67-makecurrent---destructive-promotion-from-the-reflog) handles the null source. Promoting transitions the node from `failed` to `clean` and clears `lastError`. **Operator surface:** the modal reads *"Promote this past run to current? The node will transition from failed to clean; the most recent failure detail (`{node.lastError}`) will be discarded.
Descendants will become stale and need a rebase."* +- **No "silent re-tip" affordance** — the §8.1 / Q.7.C re-tip path requires a current execution to displace into the reflog. For a `failed` node, the equivalent is just `refreshNode(id)` (rebase the node), which fires a normal dispatch. The detached panel surfaces a `[Rebase node]` button next to `[Make current]` for the operator who wants "try again with current params" rather than "go back to this past attempt." +- **Reflog-empty failed node:** the badge does not appear (no past runs to detach to). The drawer's "Past runs" tab shows *"No past runs. Use Rebase to retry."* + +### 8.2 The V1 drawer: a "Recent waves" tab + +The right-side drawer (already present, hosting the per-node "Past runs" tab per Q.7.B) gains a sibling tab: **"Recent waves"** (ConversationTree-scoped). The tab is sorted newest-first and shows: + +``` +Recent waves (this ConversationTree) +──────────────────────────────────────────── +⟲ abc123 2 min ago + Trigger: RootPrompt (edit) + 60 leaves: 57 ✓ · 3 ⚠ · 0 cancelled + [Highlight in canvas] [Open compare] (V2) + +⟲ def456 1 hour ago + Trigger: UserTurn #2 (subtree) + 15 leaves: 15 ✓ + [Highlight] [Open compare] + +⟲ ghi789 2 hours ago + Trigger: refreshTree + 30 leaves: 28 ✓ · 2 ⚠ + ... +``` + +**"Highlight in canvas"** dims all nodes *not* touched by the wave, keeping only affected nodes at full opacity. The operator can click any highlighted node to see its individual reflog entry from this wave. Clicking "Highlight" a second time (or pressing Esc) restores the normal view. + +**"Open compare"** is V2 (see §8.5). + +Implementation cost: ~80 LOC of UI on top of the existing drawer. The data is already there once `waveId` is stamped. 
+ +### 8.3 The V1.x cross-tree view: History tab gains "Group by wave" + +The existing History tab in the sidebar ([AttackHistory.tsx](../../../frontend/src/components/History/AttackHistory.tsx)) already lists `AttackResult`s with filter chips for operator, operation, attack type, outcome, and converters. The `wave_id` label is just another label — the History tab's existing labels-filter machinery picks it up for free. + +Two additions: + +1. **A new filter chip "Wave"** alongside the existing ones. Picks up `labels.wave_id` values seen in the user's recent ARs (the backend's `/labels` endpoint already returns these). Selecting a wave filters the AR list down. +2. **A "Group by wave" toggle** in the filter bar. When on, AR rows collapse into wave-group rows showing `wave_id` short suffix, timestamp, trigger ConversationTree/node ID, aggregate outcome counts, and an expand chevron. Operators see "the last 5 waves across all my worktrees" rather than "the last 300 individual ARs." + +Wave rows include an "Open in tree" button that opens (or focuses) the originating ConversationTree in the `'tree'` tab with the wave's highlight pre-applied (per §8.2). + +This is **the cross-tree answer**: don't build a new view; teach the History tab one new grouping. Operators already know History. + +### 8.4 What "digestible" actually means at scale + +The user's question framed digestibility around "redo an early message in a large tree." The numbers that matter: + +| Workspace size | Wave-affected leaves | UI treatment | +|---|---|---| +| 1 wave, 1-3 leaves | 1-3 | Inline highlight + toast. No drawer panel needed unless the operator opens it. 
| +| 1 wave, 4-30 leaves | 4-30 | Toast + Recent waves panel default-opens on completion | +| 1 wave, 31-200 leaves | 31-200 | Toast + Recent waves panel + offer "Highlight in canvas" automatically; recommend "Compare to previous wave" (V2) once available | +| 1 wave, >200 leaves | >200 | Soft cap from §9.4 of primitives already triggers an "explicit override" prompt; the wave UX inherits the cap | +| N waves across M conversation trees, recent | All sizes | History tab "Group by wave" surfaces them at workspace level | +| N waves across M conversation trees, historical | All sizes | History tab filter by `conversation_tree_id` + date range; wave grouping still applies | + +The key UX principle: **the operator never sees raw ExecutionRecords as a flat list**. The minimum aggregation is the wave; the workspace aggregation is the History tab. + +### 8.5 V2: tree-local diff view (per-wave compare) + +For the heaviest "what actually changed" question — "the model said X before my edit; now it says Y; was the difference what I hoped for?" — V2 introduces a **compare mode** on the canvas. + +Operator clicks "Compare to previous wave" in §8.2. The canvas re-renders each node card as a vertical split: previous wave's response on the left, current wave's response on the right. Stable nodes (unchanged across waves) collapse to a single read-only card. Failed nodes show the failure side-by-side with the prior success. Operators can click any card to expand to a full diff in the drawer. + +Compare mode is non-destructive — it's a different view of the same data, toggleable. V2 because it requires diff rendering primitives and careful UX for multi-modal content (images, audio, video). + +### 8.6 V2: workspace timeline (swimlanes per ConversationTree, waves as stripes) + +When the operator wants a bird's-eye view of all activity across all worktrees, V2 introduces a **Workspace Timeline** view. 
Each ConversationTree is a horizontal swimlane; the time axis runs left-to-right; each wave renders as a colored stripe spanning the lane positions of its affected leaves. Color encodes wave outcome (green = all ✓, yellow = mixed, red = mostly ⚠). + +The timeline doubles as a workspace-wide undo/redo affordance — clicking an old wave on a lane opens that ConversationTree with the wave selected. Server-side conversation tree persistence (§11 of primitives) is a prerequisite because workspace-spanning state has to survive a reload. + +This is V2 territory specifically because the data model (waveId + conversation_tree_id + workspace) is V1, but the cross-lane visualization is the kind of thing where polish matters and we want to ship the simpler History-tab grouping first to learn what operators actually need. + +--- + +## 9. Long-term vision: navigable whiteboard canvas + +The user's revision-4 Q.A.5 named the aspirational direction: a navigable canvas like a whiteboard or other flow chart editor. The revision-5 worktree adoption ([01_tree_primitives.md §13](01_tree_primitives.md#13-workspace-and-worktrees---the-data-model)) **already promotes multi-tree workspaces from aspirational to V1**. The remaining items in this section are V1.x and beyond. + +**What V1 already supports** (via react-flow's built-ins + revision-5 worktrees): +- Infinite canvas with pan (drag) + zoom (scroll/pinch). +- Minimap (§2.3) with viewport rectangle. +- Fit-to-view (`F` keyboard shortcut). +- Multi-select (lasso) and group operations. +- **Multi-tree workspaces** — each ConversationTree is its own tab in the 'tree' view (per [01 §13](01_tree_primitives.md#13-workspace-and-worktrees---the-data-model)). Clone Tree opens a new tab; closing a tab drops it from React state; History → "Open as tree" creates one. Each ConversationTree has its own viewport and selection state, persisted in the Workspace's React state for the session. 
+ +**What "feels like a whiteboard" adds beyond V1:** + +- **Operator-positioned nodes.** Pure layout algorithms are great until the operator wants to manually reorganize. A "free-positioning" mode where Buchheim-Walker becomes a starting hint (operator can drag nodes to override) is the natural next step. *V1.x; complexity is in re-running layout when topology changes without trampling manual positions.* +- **Multi-ConversationTree canvas merge.** Today each ConversationTree is its own tab (separate canvas). A "show all conversation trees on one canvas" view (Miro-style) for cross-tree comparison would be useful for retrospectives. Display-only; no data-model change. *V1.x.* +- **Sticky notes and grouping rectangles.** "I want to annotate this subtree as 'jailbreak attempts' and that one as 'baseline'". Pure visual; no data-model change. *V2.* +- **Connector overlays (non-tree).** Visual arrows that operators draw to indicate "this came from that observation", outside the conversation tree. Annotation only; not execution-relevant. *V2.* +- **Multi-operator presence cursors.** Once V2 server-side conversation trees land (§11 of primitives), real-time collaborative editing with operator cursors becomes feasible. *V2.x.* +- **Snapshot to image/SVG.** Export the canvas as a static image for sharing in incident reports or post-mortems. *V1.x; trivial with react-flow's built-in viewport-to-image.* +- **Cross-ConversationTree rebase** ("apply this prompt change to all my experiments"). *V2.1+; requires preview UX to avoid surprising the operator with mass changes.* + +None of these change the V1 conversation tree primitives. They are pure UI/UX layered on top of the existing `ConversationTreeNode` + `ConversationTreeEdge` + `conversation_tree_id` + `Workspace` model. The whiteboard direction is compatible with everything in this doc. + +--- + +## 10. What This Doc Does Not Cover + +- **Visual style** (colors, typography, spacing): a follow-up. 
+- **Onboarding / first-run experience**: a follow-up. +- **Telemetry events** to instrument operator behavior: a follow-up. +- **Keyboard-only operation specification beyond §8.4 of primitives**: a follow-up, blocked on the visual style decision (focus rings depend on the theme). + +--- + +## Summary Table + +Version column reflects the V1.0 cut decisions from this round (see [01 §1 V1.0 exclusions](01_tree_primitives.md#v10-explicit-exclusions-deferred-to-v11)). Rows marked V1.1 have a documented V1.0 fallback in §5's scenario→version map. + +| User intent | UI primitive | Git verb | ConversationTree-level operation | Version | +|---|---|---|---|---| +| Send a prompt | RootPrompt + auto-Send | (initial commit) | `addNode(root_prompt); refreshNode(send)` | V1.0 | +| Continue conversation | Edge `+` → "Follow-up" | (new commit on branch) | `insertChild(user_turn); refresh` | V1.0 | +| Re-roll response | Node `↻` | (new commit; old in reflog) | `refreshNode(send)`; old execution → reflog | V1.0 | +| Try N times | Node `↻×N` | (fan-out branches) | `wrapInFan(axis='attempt')` | V1.0 | +| Pick one of N | Stack `🎯` | cherry-pick | Set `FanNode.params.promotedChildSlotIndex` | V1.0 (visual dim only; draft-placeholder dance is V1.1) | +| Unpick (back to synced) | Stack right-click → Unpick | (revert cherry-pick) | Clear `promotedChildSlotIndex` | V1.0 | +| Follow-up to all peers | Stack `+` (Synced state, once per fan layer per §3.4a) | (commit on each branch) | Add synced child to each peer | **V1.1** | +| Follow-up to picked only | Stack `+` (Promoted state) | (commit on selected branch) | Add N symmetric peers; promoted is non-draft, N-1 are draft placeholders (§3.3) | **V1.1** | +| Try N converters | Node `🔀` → axis=converter | (fan-out branches) | `wrapInFan(axis='converter')` | V1.0 | +| Try N targets | Node `🔀` → axis=target | (fan-out branches; new ARs) | `wrapInFan(axis='target')` | **V1.1** | +| Edit upstream | Node `✏` | (amend or new commit) | `editParams`; 
descendants → stale | V1.0 | +| Rebase subtree | Right-click → Refresh / shift-`↻` | rebase | `refreshSubtree(id)` | V1.0 | +| Branch from node | Node `📋` | `git branch ` | `branchToNewTree(id)` (V1.0: swaps active tree; V1.1: new tab in strip) | V1.0 (always-new-tree swap variant; tab strip is V1.1) | +| Branch as subtree | Node `🌿` | `git branch ` (in-canvas) | `branchToSubtree(id)` landing as sibling subtree in same canvas | **V1.1** (V1.0 disabled stub) | +| Clone whole tree | Root `📋` | `git checkout -b new` | `branchToNewTree(root.id)` (degenerate case; same function) | V1.0 (same swap semantics as branch from node) | +| View past run | Node card → reflog drawer → click run | `git checkout ` (detached) | Display-only; node enters detached state | V1.0 | +| Make past run current | Past run → "Make current" | `git reset --hard ` | Swap `execution` with reflog entry | V1.0 | +| Open historical | History → "Open as tree" | (browse a branch) | Auto-reverse (§9.3 of primitives) | V1.0 (linear+converter; fanout detection V1.1) | +| Read linear | Node `🔍` | (log of one branch) | Switch right pane to linear view | V1.0 | +| Delete branch | Node `🗑` | (delete branch ref) | Remove tree nodes; backend ARs preserved | V1.0 | +| Review a refresh | Toast → "View wave" / drawer "Recent waves" tab | (read `git log `) | Filter ExecutionRecords by `waveId` (§8.1, §8.2) | V1.0 | +| Cross-ConversationTree wave search | History tab → "Group by wave" toggle (V1.x) | (`git log --all`) | SQL group by `labels.wave_id` over all ARs (§8.3) | V1.x (depends on Workspace + History extension) | +| Compare current to previous wave | Drawer "Compare" tab (V2) | (`git diff `) | Per-node diff over last two `waveId`s (§8.5) | V2 | diff --git a/doc/gui/design/03_runner.md b/doc/gui/design/03_runner.md new file mode 100644 index 0000000000..94fbb63fc2 --- /dev/null +++ b/doc/gui/design/03_runner.md @@ -0,0 +1,1273 @@ +# Tree-Based UI — Runner Spec (V1.0 stub) + +> Status: **DRAFT stub (revision 
18)** — companion to [01_tree_primitives.md](01_tree_primitives.md) and [02_tree_ui_affordances.md](02_tree_ui_affordances.md). This doc is intentionally outline-level. Each section names what the runner does and references the primitives section that decides the *why*; sections marked **TODO:spec** need a focused expansion pass before the runner is implemented. The reviewer's strong recommendation was "write the runner spec before any code" — this stub lets implementers start fanning out (interfaces, state-update plumbing, the dispatch queue) in parallel with the spec-expansion work. +> Rolling revision history lives at [01 §0](01_tree_primitives.md#0-rolling-revision-history); refer there for cross-doc change summaries. The freshest substantive gate items between current state and implementer onboarding are [Q.S.1–Q.S.3](#12-open-questions) below. + +### Version-scope legend + +Shared with [01](01_tree_primitives.md#version-scope-legend) and [02](02_tree_ui_affordances.md#version-scope-legend). V1.0 surface only is fleshed out below; V1.1 deltas (per-Workspace budgeting, Synced-Peers Stack dispatch, multi-tab fair-share) are flagged inline. + +## 1. Goals & Non-Goals + +### Goals + +1. **Translate a ConversationTree into backend calls deterministically.** Same tree shape + same node states → same call sequence (modulo concurrency ordering). No hidden runner heuristics that aren't in the data model. +2. **Honor the V1 contract that nothing fires unless the operator asks.** Edits mark nodes edited/stale (§6.3 of [01](01_tree_primitives.md#63-propagation-rules)); the runner is silent until `refreshNode`, `refreshSubtree`, or `refreshTree` is called. +3. 
**AR-per-leaf with no backend changes.** Per the materialization rule in [01 §7.1](01_tree_primitives.md#71-conversationtree-operation--backend-call), each leaf `SendNode` dispatch is a **`create_attack` + N `add_message` sequence**: first `POST /api/attacks` ([`create_attack`](../../../pyrit/backend/routes/attacks.py#L184)) to create the AR with the resolved clean-prefix history as `prepended_conversation`, then one `POST /api/attacks/{new_id}/messages` ([`add_message`](../../../pyrit/backend/routes/attacks.py#L432)) per stale Send on the path (in topo order, finishing at the leaf). `create_attack` is context setup; `add_message` with `send=True` is the call that produces the assistant response. The N add_messages re-fire stale interior Sends and the leaf within the same AR — see §3.2 / §3.3 for the partition rule and the deadlock-avoidance reasoning. Existing backend semantics; the runner does not change them. +4. **Bounded concurrency.** `maxParallel=4` (V1.0: per-session; V1.1: per-Workspace with fair-share). The runner is the single chokepoint that enforces this — no other layer should fire backend calls. **Each leaf's full dispatch sequence (`create_attack` + N `add_message`s) counts as one budget slot** held atomically for the duration; all calls in the sequence execute sequentially within the same slot. +5. **Partial-commit on failure.** In-flight calls complete; not-yet-dispatched nodes transition to `cancelled` (§6.4 of [01](01_tree_primitives.md#64-failure--partial-commit-semantics)). +6. **Wave bookkeeping.** Every refresh stamps a fresh `waveId` and a `waveTriggerKind` from the §14.4 enum on each affected `ExecutionRecord` and on each leaf AR's `labels.wave_id` / `labels.wave_trigger_kind` (see §6). + +### Non-Goals + +- **Server-side runner / queue.** V1's runner is a client-side TypeScript module under `frontend/src/runner/` (proposed path). The backend is a stateless target of HTTP calls. 
The §6.4 partial-commit semantics live in the client because there's no server-side cancellation surface (see §9 and [01 §12.8](01_tree_primitives.md#128-cancellation-deferred---accepted-follow-up-v1x)). +- **Retries with backoff.** The runner does not retry failed calls. The backend's `AttackService` already has [`max_attempts_on_failure`](../../../pyrit/attacks/) at the *per-attack* layer; the runner adds no second retry layer (would compound exponentially in fan-outs). Failed nodes surface to the operator who decides whether to re-trigger. +- **Streaming partial responses.** The runner awaits each backend POST to completion. SSE / WebSocket streaming is a V2 polish item. +- **Cross-tab synchronization.** Two browser tabs with two tree views run independent runners; per [01 §9.4.3](01_tree_primitives.md#943-concurrent-tab-advisory-lock-v10), V1.0 ships a `BroadcastChannel`-based **advisory lock** keyed on `conversation_tree_id` that prevents two tabs from concurrently rebasing the same tree (the dominant fork-bomb risk). The lock is advisory — it bounds the common case without requiring server-side coordination. Full coordination (live state sync, undo/redo across tabs) is V2. +- **Distributed dispatch.** No worker pool, no Web Workers — the runner is one async loop in the main thread. The bottleneck is network I/O, not CPU. **TODO:spec** — benchmark whether the JSON-serialization cost for a 200-message `prepended_conversation` justifies pushing the serialize step to a Worker. Likely "no" for V1.0; revisit if a 60-leaf refresh visibly janks the UI. + +## 2. Surface Area + +### 2.1 Entry points (the public API) + +```ts +// frontend/src/runner/runner.ts (proposed) + +export interface Runner { + /** Refresh exactly one node. Idempotent during a single in-flight call. + * + * V1.0 behavior by node kind ([01 §6.3](01_tree_primitives.md#63-propagation-rules) rule 2): + * - root_prompt / import_message: no dispatch (re-hydrate seed bundle locally). 
+ * - user_turn / score: no dispatch (recompute resolvedInputHash; clean if upstream clean). + * - send (leaf): one dispatch sequence via §3.3. + * - send (interior): aliased to refreshSubtree(id) restricted to descendant leaves — + * per [01 §6.3 rule 2 'send (interior)'](01_tree_primitives.md#63-propagation-rules), the + * runner cannot fast-path a single interior Send because downstream leaf ARs still + * reference the interior's OLD assistant pieces in their prepended_conversation. + * - fan: aliased to refreshSubtree(id) — fan children are typically user_turn nodes, + * and "refreshing" a user_turn is a no-op state recompute. Aliasing to subtree-refresh + * walks every Send descendant under the fan, which is what the ↻ action rail's + * "Refresh all children" tooltip means to the operator. + */ + refreshNode(treeId: ConversationTreeId, nodeId: ConversationTreeNodeId): Promise<void> + + /** Refresh the node and all transitively-stale descendants. The §6.3 propagation + * rules already marked the right set as stale; the runner walks them in topo order. */ + refreshSubtree(treeId: ConversationTreeId, rootNodeId: ConversationTreeNodeId): Promise<void> + + /** Convenience: refreshSubtree(treeId, tree.rootId). */ + refreshTree(treeId: ConversationTreeId): Promise<void> + + /** Cancel the active in-flight wave for this tree (V1.0; UI-level only — flips a per-wave + * flag that the dispatch loop checks at each `ready.popNext()` boundary per §9). In-flight + * HTTP calls complete; not-yet-dispatched leaves transition to `cancelled`. Returns when + * the wave fully settles. Does NOT touch queued waves — use `cancelQueued` for those. + * V1.x adds backend-token cancellation that aborts in-flight calls. */ + cancelWave(treeId: ConversationTreeId): Promise<void> + + /** Drop every queued (not-yet-active) wave for this tree (V1.0; per [§10.3](#103-backpressure-per-tree-wave-queue)). + * Does NOT affect the active wave — use `cancelWave` for that. 
Resolves immediately; + * dropped waves emit a `WaveEvent { kind: 'complete', summary.cancelled: <count> }` + * so the UI reconciles their queued banner state. */ + cancelQueued(treeId: ConversationTreeId): Promise<void> + + /** Retry a specific set of leaves (V1.0; called by the [02 §5.14](02_tree_ui_affordances.md#514-partial-failure-mid-refresh) `[Retry failed]` + * toast button). `nodeIds` is captured by the UI at wave-complete time — the union of + * the wave's failed leaves (any `failure_class` except `permanent`) plus its `blocked` + * leaves. The runner builds `S` for this wave as: those nodeIds themselves PLUS any + * `failed`/`cancelled` Send ancestors on each nodeId's root-to-leaf path (so the + * [§3.1 step 2b retry-failed demotion](#31-topological-walk) can flip them back to + * `stale` and the path becomes dispatchable). `waveTriggerKind = 'retry_failed'`. + * + * Distinct from `refreshSubtree(rootId)` because the retry is scoped to wave-W's + * victims, not the whole tree — an operator who edited an unrelated node between + * the original wave and the retry click does NOT have that edit swept up by retry. + * The toast captures `nodeIds` at completion time so this scope is stable even if + * the operator edits the tree before clicking. */ + retryFailedNodes(treeId: ConversationTreeId, nodeIds: ConversationTreeNodeId[]): Promise<void> +} +``` + +All three refresh methods return a `Promise<void>` that resolves when the wave is *settled* (every dispatched call has terminated — succeeded, failed, or cancelled). Per-node state updates flow through the React state container during the wave; callers `await` only when they need to know the wave is over (e.g., for telemetry or test assertions). + +#### Entry-point shim ordering (V1.0) + +Each `refresh*` method is implemented by an **entry-point shim** that runs five steps in a fixed order *before* the dispatch loop in [§3.1](#31-topological-walk) executes. Steps 2-5 are wrapped in `try { ... 
} finally { lockManager.release(treeId) }` so the cross-tab lock is released on every exit path — success, failure, cancel, OR early-return from the tag-hygiene gate or wave-queue check. + +```ts +async function refreshSubtree(treeId, rootNodeId, triggerKind) { // mirror for refreshNode / refreshTree + // 1. Tag-hygiene gate (runs BEFORE lock acquire so a tag-missing operator does + // not lock out other tabs while seeing the modal). Per [§3.1 step 0 reframe](#31-topological-walk). + const operator = currentOperator() + if (!operator) { + sink.emitWaveEvent({ kind: 'operator_tag_required', treeId }) + return // wave never starts; no lock acquired, no cost modal, no node state mutated + } + + // 2. Cross-tab advisory lock (§10.4). Acquire BEFORE the cost modal so a second + // tab can't sneak in while the operator reads the cost confirmation. The + // try/finally below guarantees release on every exit path. + const lock = await lockManager.acquire(treeId) + if (lock === 'busy') { + sink.emitWaveEvent({ kind: 'busy', treeId, holderTabId: ... }) + return // no lock acquired, nothing to release + } + + try { + // 3. Cost guardrail (§2.3). Operator may cancel here; the lock release in finally + // runs and the other tab can proceed. + const estimatedCalls = estimate(rootNodeId) + const approved = await costGuardrail.approve(estimatedCalls, triggerKind) + if (!approved) return + + // 4. Per-tree wave-queue check (§10.3). If another wave is active on this tree, + // enqueue this one and return; the lock release in finally fires (the active + // wave holds its own lock acquired earlier). When the active wave settles, + // the queue drain logic re-acquires the lock for each queued wave via this + // same shim. + if (currentWaveByTree.has(treeId)) { + const req = { waveId: uuid(), rootNodeId, triggerKind, enqueuedAt: now() } + queueByTree.get(treeId)?.push(req) ?? 
queueByTree.set(treeId, [req]) + sink.emitWaveEvent({ kind: 'queued', waveId: req.waveId, treeId, queueDepth: queueByTree.get(treeId)!.length }) + return + } + + // 5. Wave start (§3.1). The dispatch loop runs to settlement; its emitWaveEvent + // `complete` event fires before this function returns. + currentWaveByTree.set(treeId, { rootNodeId, triggerKind }) + try { + await _runDispatchLoop(treeId, rootNodeId, triggerKind) // §3.1 + } finally { + currentWaveByTree.delete(treeId) + } + // Drain queue if non-empty (each queued wave re-enters via the same shim above). + while ((queueByTree.get(treeId) ?? []).length > 0) { + const next = queueByTree.get(treeId)!.shift()! + await refreshSubtree(treeId, next.rootNodeId, next.triggerKind) // re-enters the shim + } + } finally { + lockManager.release(treeId) // unconditional; every exit path releases + } +} +``` + +**Why this ordering.** The five steps run in this order specifically: + +1. **Tag-hygiene gate FIRST.** Operator with no tag set sees the modal before any other UI surface or lock acquire. Reviewer rev-15 spotted that placing this at §3.1's step 0 (the previous spec) caused the cost modal to fire first AND leaked the cross-tab lock on early-return. Moving it to step 1 of the shim fixes both at once. +2. **Lock acquire SECOND.** Cost modal can take seconds for the operator to read; a second tab racing in during that window would otherwise blow `maxParallel` cumulative across tabs. +3. **Cost modal THIRD.** Operator confirms what they're about to spend; cancel returns through finally and releases the lock. +4. **Queue check FOURTH.** Only after cost approval do we decide whether to enqueue (lock is released in finally; the active wave holds its own lock from its earlier shim invocation). Queue semantics (FIFO, no-coalescing, stale-set recomputed at wave-start, banner copy) are spec'd in [§10.3](#103-backpressure-per-tree-wave-queue); this shim is the canonical implementation of that contract. +5. 
**Wave start FIFTH.** The §3.1 dispatch loop runs; its `complete` event is the natural wave-settle marker that the lock-release finally also covers. + +### 2.2 State-update plumbing + +The runner does not own React state. It receives a `RunnerStateSink` at construction: + +```ts +export interface RunnerStateSink { + /** Move a node into a new lifecycle state (clean/edited/stale/running/failed/cancelled). + * The optional `opts.reason` populates the node's `lastError` field for failed/cancelled + * transitions (per [01 §6.4.1](01_tree_primitives.md#641-why-nodeexecution--null-on-failure-not-preserved)); on transitions away from failed + * (e.g., back to running on retry), the sink clears `lastError`. */ + setNodeState( + treeId: ConversationTreeId, + nodeId: ConversationTreeNodeId, + state: NodeState, + opts?: { reason?: string | ApiErrorReason | null }, + ): void + + /** Attach a fresh ExecutionRecord to a node (also moves prior execution into reflog + * per [01 §6.6](01_tree_primitives.md#66-executionhistory-gc-the-reflog) — wrapping + * the prior execution in a `ReflogEntry` with `pinned=false`). */ + recordExecution(treeId: ConversationTreeId, nodeId: ConversationTreeNodeId, record: ExecutionRecord): void + + /** Null out a node's `execution` field. Called on `failed` and `cancelled` transitions + * per [01 §6.4.1](01_tree_primitives.md#641-why-nodeexecution--null-on-failure-not-preserved). Does NOT touch `executionHistory` + * (the reflog only ever receives executions that completed via `recordExecution`). */ + clearExecution(treeId: ConversationTreeId, nodeId: ConversationTreeNodeId): void + + /** Set or clear the `pinned` flag on a `ReflogEntry` (per [01 §6.6](01_tree_primitives.md#66-executionhistory-gc-the-reflog)). + * Per-tree per-execution; called by the UI when the operator clicks Pin/Unpin in the reflog + * drawer. No-ops if the entry is not in the tree's reflog (e.g., was just evicted). 
*/ + setReflogPinned( + treeId: ConversationTreeId, + nodeId: ConversationTreeNodeId, + executionId: string, + pinned: boolean, + ): void + + /** Emit a wave event (start / per-node-complete / wave-complete) so the UI can + * render the [02 §2.3](02_tree_ui_affordances.md#23-canvas-level-affordances) progress bar and the [02 §8.1](02_tree_ui_affordances.md#81-the-v1-chain-preview-banner--confirm-modal--toast--drawer-panel) toast. */ + emitWaveEvent(event: WaveEvent): void +} +``` + +The sink is the **only** way the runner mutates React state. This boundary keeps the runner unit-testable with a mock sink (see §11) and prevents the temptation to import React hooks inside the dispatch loop. + +**Sink reason semantics (V1.0).** `opts.reason` accepts three shapes: + +- `string` — plain text. Sink normalizes to `{ message: <string>, failure_class: 'transient' }` (defensive default; pre-rev-15 callsites that just passed a string land in `transient`). +- `ApiErrorReason` — the structured `{ message; failure_class }` from [§3.3a `_format_api_error`](#33a-helpers-referenced-by-the-dispatch-step). Sink writes the object directly to `node.lastError`. +- `null` — clear `node.lastError` entirely (set to `null`). Used by the [§3.1 step 2b retry-failed demotion](#31-topological-walk) when flipping `failed`/`cancelled` nodes back to `stale` for a retry wave. Distinct from "omitted" (no `reason` key in `opts`): omitted leaves the existing `lastError` unchanged. The same null-clears-vs-omitted-leaves-unchanged convention applies on `clean` transitions (recordExecution-driven; the sink clears `lastError` implicitly on success). + +**Missing-node tolerance.** All sink mutating methods (`setNodeState`, `recordExecution`, `clearExecution`, `setReflogPinned`) silently no-op when the target node does not exist in the current tree state (e.g., operator deleted the node mid-wave). 
The runner discovers the deletion at no extra cost — the next `sink.setNodeState` for the deleted node is a no-op, the next `ready.popNext()` ignores deleted nodes, the wave settles without the deleted-node contributions. The sink emits a single telemetry event `node_dispatched_post_delete` per occurrence (sampled), so operators-of-the-runner can detect if the pattern is common in practice. Wave-complete summary counts the deletion-victim as `cancelled` (not as `failed.*` — the operator made the choice; not `clean` — the dispatch didn't complete). + +### 2.3 Cost-guardrail hook + +Before dispatch, the runner consults the count-based guardrail per [02 §8.1](02_tree_ui_affordances.md#81-the-v1-chain-preview-banner--confirm-modal--toast--drawer-panel) (`confirmThresholdCount`, default 20): + +```ts +export interface CostGuardrail { + /** Returns true if the wave is approved (operator clicked through the modal, or + * the count was under threshold). False short-circuits the wave with state unchanged. */ + approve(estimatedCalls: number, waveTriggerKind: WaveTriggerKind): Promise<boolean> +} +``` + +The estimate (V1.0): **`Σ leaves (count of stale Sends on each leaf's root-to-leaf path)`** — each leaf's dispatch fires one `create_attack` plus N sequential `add_message` calls (per §3.3), and per-leaf paths are dispatched independently. Practical examples: +- Single-leaf, 10-deep stale chain: 10 calls. +- 60-leaf attempt-fan with a clean prefix: 60 calls (each leaf is its own fresh suffix; no shared interior Sends because attempt-fan children diverge at the leaf-Send itself). +- 60-leaf attempt-fan with a 10-deep shared stale prefix: 60 leaves × 10 stale-Sends-per-path = 600 calls. Each leaf re-fires the shared prefix independently. The [02 §8.1](02_tree_ui_affordances.md#81-the-v1-chain-preview-banner--confirm-modal--toast--drawer-panel) cost-guardrail modal (default `confirmThresholdCount = 20`) intercepts and asks the operator to confirm before any backend call. 
+- 3-leaf prompt-fan with a 5-deep shared stale prefix: 3 leaves × 5 stale-Sends-per-path = 15 calls. (V1.1 — V1.0 ships only `attempt` and `converter` axes per [01 §4.4](01_tree_primitives.md#44-structural-nodes--the-single-fan-out-primitive).) + +The estimator counts what the runner will actually fire — each leaf's dispatch is independent in V1.0. No cost-based variant in V1.0 — see [02 §8.1](02_tree_ui_affordances.md#81-the-v1-chain-preview-banner--confirm-modal--toast--drawer-panel) roadmap note. **Intra-wave memoization** for shared stale interior Sends (which would collapse the 60-leaf/10-deep-shared-prefix case from 600 to 70 calls by regenerating the shared prefix once per wave) was designed in revision 14 and cut in revision 15 per reviewer Finding 2 — see [§12 Q.6](#12-open-questions) for the V1.1 follow-up. + +## 3. The Dispatch Loop + +### 3.1 Topological walk + +``` +Inputs: treeId, set S of in-need-of-dispatch nodes + For refreshNode/refreshSubtree/refreshTree: S = {n : n.state ∈ {'edited','stale','failed','cancelled'} AND n is within scope (subtree root or whole tree)} + For retryFailedNodes(nodeIds): S = {nodeIds} ∪ {failed/cancelled Send ancestors on each nodeId's path} + — scoped to the specific leaves the [Retry failed] toast captured +Outputs: per-node execution updates via RunnerStateSink + +1. waveId ← uuid() +2. waveTriggerKind ← inferred from caller (§6.2 below) +2a. cancelled ← false // per-wave cancel flag; flipped by sink's cancelWave (§9) + // Tag-hygiene gate (formerly step 0) now runs at the [entry-point shim per §2.1](#entry-point-shim-ordering-v10), + // before the cross-tab lock acquire and cost guardrail. By the time the dispatch loop + // runs, `currentOperator()` is non-null/non-empty by construction — no need to re-check + // here, and the previous step-0 lock leak (rev-15 Finding 4) is closed. +2b. // Retry-failed pre-readiness demotion (per §5.3 step 4). 
+ // Without this, S-member failed/cancelled nodes would still be in state + // failed/cancelled when step 3's readiness allowlist runs, and the leaves below + // them would be excluded from `ready` — silently no-op'ing the retry wave. + // Demotion to `stale` puts them in the ancestor allowlist; their leaves enter + // `ready` and dispatch normally; the interior failed Sends are regenerated as + // part of each descendant leaf's fresh suffix per §3.2. + if waveTriggerKind == 'retry_failed': + for n in S where n.state in {'failed', 'cancelled'}: + sink.setNodeState(treeId, n.id, 'stale', opts={'reason': null}) + sink.clearExecution(treeId, n.id) // belt-and-suspenders; already null per [01 §6.4.1](01_tree_primitives.md#641-why-nodeexecution--null-on-failure-not-preserved) +3. ready ← { n ∈ S : n is a leaf Send AND every Send ancestor of n has node.state ∈ {edited, stale, running} or is clean } + // Interior Sends never appear in `ready` — they are dispatched as part of their + // descendant leaf's dispatch sequence per §3.2. The readiness rule for leaves + // checks that the leaf's path is dispatchable: ancestors are either pending in this + // wave (edited/stale, will be regenerated as part of the leaf's dispatch), + // currently dispatching (running, the leaf will be added to `ready` after the + // ancestor's completion), or previously clean (their stored pieces feed + // prepended_conversation). `failed` and `cancelled` ancestors EXCLUDE the leaf from + // `ready` until a separate [Retry failed] wave (§6.2 `waveTriggerKind='retry_failed'`) + // re-admits them; this is the in-flight-cascade contract from §5.3. +4. inflight ← ∅ +5. 
while ready ≠ ∅ or inflight ≠ ∅: + while |inflight| < maxParallel and ready ≠ ∅: + n ← ready.popNext() // fair-share pick when V1.1; FIFO V1.0 + sink.setNodeState(n, 'running') + promise ← dispatch(n, waveId, waveTriggerKind) + inflight.add(promise) + completed ← await Promise.race(inflight) + inflight.delete(completed.promise) + handleCompletion(completed) // state transition + cascade ready set +6. // Wave-end transform reconcile (per reviewer rev-15 Finding 9 / [§3.3a](#33a-helpers-referenced-by-the-dispatch-step) `reconcileAllTransforms`). + // The per-dispatch `reconcileTransformStates(treeId, path)` calls in §3.3 only touch + // transforms ON the just-completed leaf's root-to-leaf path. ScoreNodes (and any + // UserTurn/Fan) operators attach as SIBLINGS of a Send — the operator-typical + // placement for "score this leaf's response" — are never on a dispatched leaf's path + // and would stay `stale` indefinitely. The wave-end pass walks every node in the tree + // once and applies the same per-node reconcile rule. O(tree-size); negligible at + // typical 60-node trees, bounded by the 1000-node soft cap. + reconcileAllTransforms(treeId) + sink.emitWaveEvent({ kind: 'complete', waveId, summary }) +``` + +**`S = {edited, stale, failed, cancelled}` — failed/cancelled stay in S, but the readiness rule excludes them from the ancestor allowlist.** S still admits failed/cancelled leaves so a separate retry wave (`waveTriggerKind='retry_failed'` per §6.2, triggered by the [02 §5.14](02_tree_ui_affordances.md#514-partial-failure-mid-refresh) toast button) can dispatch them — the leaf itself reads `state ∈ S` and is eligible. **What changed in revision 15 (per reviewer Finding 4):** the ancestor-side allowlist no longer admits `failed`/`cancelled`. 
An earlier framing accepted any S-member ancestor as "will be regenerated as part of the leaf's dispatch," producing retry amplification where every sibling leaf sharing a transiently-failed ancestor X would independently retry X via `add_message` in its own `fresh_suffix`. Under V1.0's no-backpressure model (Finding 6a) this amplifies a single 5xx into `min(maxParallel, sibling_count)` retries. The new rule blocks descendants of in-wave failures; the operator's [Retry failed] click starts a fresh wave with `S = {failed,cancelled,...}` whose leaves ARE now `failed` (themselves in S) with no in-wave failed-ancestor blocker, so they dispatch normally. See §5.3 for the cascade contract. + +**`ready.popNext()` in V1.0** is FIFO over insertion order (which happens to be topological order). **V1.1** changes this to fair-share across multiple `ConversationTree`s — see §10.2. + +**`handleCompletion`** flips the node to `clean` (on success) or `failed`, and re-evaluates `ready` for any newly-eligible descendant. A descendant becomes eligible when *all* of its parents are in `clean` state. A descendant whose parent failed stays `stale` (per [01 §6.4](01_tree_primitives.md#64-failure--partial-commit-semantics)) and never becomes ready in this wave. + +### 3.2 What gets dispatched + +The dispatch step varies by node kind (see [01 §4](01_tree_primitives.md#4-node-taxonomy) "side-effect class" spine): + +| Side-effect class | Node kinds | Dispatch action | +|---|---|---| +| **Source** | `RootPromptNode`, `ImportMessageNode` | No backend call. State transitions to `clean` immediately; cascade. | +| **Transform** | `UserTurnNode` | No backend call. Pure local computation (resolved input bundle update). Cascade. 
| +| **Side-effecting** | `SendNode` (leaf or interior) | **Only leaves are picked from the `ready` queue.** A leaf's dispatch fires **one `create_attack` + N `add_message` calls** in sequence (held within one concurrency slot, §10.1) where N = the count of stale `SendNode`s on the leaf's root-to-leaf path (including the leaf itself). Each `add_message` regenerates one Send's assistant pieces; interior Sends on the path transition `running → clean` as their add_message returns. See §3.3 for the partition rule and §4.1 for the resolver. | +| **Structural** | `FanNode` | No backend call. Materializes children if needed; cascade per-child. | +| **Observational** | `ScoreNode` | **V1.0: render-only**, reads upstream `MessagePiece.scores` already attached to ancestor pieces. The runner does not enqueue ScoreNodes and never issues scorer requests. The `✏ Configure scorer` affordance is a disabled stub per [02 §2.2](02_tree_ui_affordances.md#22-per-node-action-rail). State is reconciled by the wave-end [`reconcileAllTransforms`](#33a-helpers-referenced-by-the-dispatch-step) pass at [§3.1 step 6](#31-topological-walk) — ScoreNodes attached as siblings of a Send (the operator-typical placement) are reconciled correctly, not only when they happen to sit on a dispatched leaf's path. **V1.1+:** one POST to a future `/api/scores` route per [01 §4.5](01_tree_primitives.md#45-observational-nodes-no-side-effect-on-the-conversation). **TODO:spec** — wire to the existing scorer service in V1.1. | + +**Interior `SendNode`s never appear in the `ready` queue.** Per the §3.1 readiness rule, a node becomes ready when *every* parent is `clean`. Interior Sends with stale upstream are themselves stale; their leaf descendants then can't become ready (their interior-Send parent isn't `clean`). To avoid the deadlock that would otherwise result, **V1.0 treats every interior Send as part of its descendant leaf's dispatch sequence**, never an independent dispatch. 
The ready-set computation skips interior Sends entirely — only leaves are picked. When a leaf's dispatch runs, it claims every stale Send on its path (transitioning them `stale → running` together at dispatch start), then transitions each `running → clean` as the corresponding `add_message` returns. The §3.3 dispatch loop spells out the partition. + +**Why not regenerate interior Sends as their own ARs.** Reviewer rev 10 suggested making interior Sends into "mini-leaves" with full `create_attack + add_message` pairs of their own — producing N ARs per chain refresh. Rejected because (a) it breaks AR-per-leaf (`labels.conversation_tree_id` filtering returns N×leaves rows, not leaves), (b) the History view becomes confusing (N rows per leaf with no operator-visible distinction between leaf and interior), and (c) the single-AR-with-N-add_messages model in §3.3 below uses the same total target calls without the AR-row explosion. + +**Leaves with shared interior Sends — each leaf dispatches independently in V1.0.** Two leaves L1, L2 that share a stale interior Send X each regenerate X in their own dispatch sequence: L1 fires `create_attack + N add_message`s with X in its fresh suffix; L2 fires `create_attack + M add_message`s with X *also* in its fresh suffix. The target is called once per leaf for X, not once per wave. For a 60-leaf attempt-fan with a 10-deep shared stale prefix this costs 600 target calls (60 leaves × 10 stale Sends per path) rather than the 70 calls that intra-wave memoization would achieve. + +**Cost ceiling.** The [02 §8.1](02_tree_ui_affordances.md#81-the-v1-chain-preview-banner--confirm-modal--toast--drawer-panel) cost-guardrail modal fires at 20 calls (default `confirmThresholdCount`), so a 600-call refresh is intercepted before any backend call goes out. The operator sees *"Refresh 60 leaves? Estimated 600 target calls. [Refresh] [Cancel]"* and decides. 
If they need surgical scope, [01 §6.5](01_tree_primitives.md#65-branch-from-node---the-immutable-history-primitive) `branchToNewTree` from a midpoint scopes the refresh to one path. + +**Why this is V1.0-acceptable.** V1.0 ships only the `attempt` and `converter` fan axes ([01 §4.4](01_tree_primitives.md#44-structural-nodes--the-single-fan-out-primitive)). Walk both: attempt-fan children diverge at the leaf-Send (no shared interior Sends to dedupe), and converter-fan children diverge at the converter `UserTurn` (each child's downstream Sends produce different outputs because the input was converted differently). The chain-then-fan tree shape with edits high up the chain — the only shape that benefits — is a real workflow (Crescendo with depth-extension) but not the dominant V1.0 use case. V1.1 may add intra-wave memoization once telemetry quantifies the workflow's prevalence (see [§12 Q.6](#12-open-questions)). + +**Tree-side X state after the wave.** Each leaf's dispatch regenerates X independently. The wave's `recordExecution` for X is determined by last-writer-wins on the leaf completion order; since interior-Send `ExecutionRecord`s collapse into the leaf AR they share, the operator sees the final X execution from whichever leaf completed last. Practically harmless because every leaf's `ExecutionRecord` carries the same `waveId` and reads the same prepended chain; the only operator-visible difference is the `conversation_id` of the leaf AR that owns the displayed X record. + +**Orphan-Send case (Send with no descendants — not just no leaf descendants).** A SendNode with no children at all (operator added a Send, deleted its child UserTurn, never added a replacement) is itself a leaf per the §2 vocabulary definition. It enters `ready` and dispatches normally as a single-Send sequence (one `create_attack` + one `add_message`). No special-case behavior — the dispatch loop treats it the same as any other leaf. 
Operators who didn't intend to fire the orphan can delete it before the wave starts; the [02 §5.16 delete-a-branch](02_tree_ui_affordances.md#516-delete-a-branch) affordance applies. + +### 3.3 Dispatch step (leaf SendNode) — partition + create_attack + sequential add_message calls + +Per the §3.2 model, a leaf's dispatch is **one `create_attack` followed by N `add_message` calls in sequence**, where the N add_messages correspond to the stale Sends on the leaf's path (including the leaf itself). The partition rule: + +- **Clean prefix:** Sends on the path that are `clean` (their current params match their existing execution's `resolvedInputHashAtExecution`). Their input UserTurns + their assistant-response pieces go into `prepended_conversation`. No add_message needed — these turns are pre-loaded into the AR's conversation as historical context. +- **Fresh suffix:** the first stale Send on the path and everything after (down to and including the leaf). Each `(input_user_turn, send_node)` pair becomes one sequential `add_message(send=True)` call. Each call fires the target and produces fresh assistant pieces, which become that Send's new `ExecutionRecord.pieceIds`. + +The whole sequence is one AR (cleanly filterable in History by `conversation_tree_id`) and one concurrency slot. + +```python +async def dispatch(leaf_send_node, waveId, waveTriggerKind): + # Hold one concurrency slot for the whole sequence (§10.1): + async with dispatchSemaphore: + path = root_to_node_path(leaf_send_node) + # Partition: returns (prepended_messages, fresh_suffix_pairs). + # - prepended_messages: list[PrependedMessageRequest], one per turn in clean prefix. + # - fresh_suffix: list[(UserTurnNode, fan_variant_or_None, SendNode)] in topo order. + # Each entry includes the fan-variant (axis, slot) the resolver captured if a Fan + # ancestor sits between the UserTurn and this Send; None otherwise. 
+ prepended, fresh_suffix = resolve_path_partition(path) # §4.1 + if len(prepended) > 200: # Backend cap is on prepended_conversation only (max_length=200). + sink.setNodeState(treeId, leaf_send_node.id, 'failed', + opts={'reason': 'clean prefix exceeds 200 turns; branch from a midpoint to continue'}) + # Reconcile transform ancestors so any UserTurn/Fan/Score that were `stale` + # waiting on this leaf settle correctly. With the leaf now `failed`, the + # reconciler's "all descendants clean" check is false for them — they stay stale — + # but the walker itself is idempotent and safe to invoke here. + reconcileTransformStates(treeId, path) + return + + # Mark all stale Sends in fresh_suffix as `running` together (interior + leaf). + # Each leaf's dispatch regenerates its own copy of any shared interior Sends — + # V1.0 has no intra-wave memoization (per §3.2; deferred to V1.1 per §12 Q.6). + for _, _, send_node in fresh_suffix: + sink.setNodeState(treeId, send_node.id, 'running') + + # The post-cap body is wrapped in try/finally so reconcileTransformStates runs + # on every dispatch outcome — success, create_attack failure, or mid-chain + # add_message failure. Without the finally, a mid-chain failure that left some + # Sends `clean` would leave their UserTurn ancestors lingering in `stale` because + # the post-loop reconcile call was never reached (the failure path `return`s early). + # See [§3.3a `reconcileTransformStates`](#33a-helpers-referenced-by-the-dispatch-step) — + # the walker is idempotent and bounded by path length; the per-dispatch invocation + # is cheap regardless of outcome. + try: + # Call #1 — create_attack: setup only, no target call. + # Returns attack_result_id AND conversation_id; we need conversation_id for add_message. 
+ try: + create_resp = await attacksApi.createAttack(CreateAttackRequest( + target_registry_name=path.target, + prepended_conversation=prepended, + labels=_build_labels(path, treeId, waveId, waveTriggerKind), + )) + except ApiError as e: + reason = _format_api_error(e, 'create_attack') # §3.3a — discriminates 4xx vs. 5xx for retry UX + for _, _, send_node in fresh_suffix: + sink.setNodeState(treeId, send_node.id, 'failed', reason=reason) + sink.clearExecution(treeId, send_node.id) + return + + # Calls #2..N+1 — one add_message per (UserTurn, fan_variant, Send) in fresh_suffix. + # Each call fires the target; assistant pieces become that Send's new execution. + # `prior_max_turn_number` tracks the highest turn_number already in the AR so the + # next call's response can be diffed to find new pieces (see §3.3a + # `_extract_new_assistant_pieces`). Backend turn_number is 1-indexed; len(prepended) + # is the count of messages create_attack just persisted, so that's the starting max. + prior_max_turn_number = len(prepended) + for idx, (input_ut, fan_variant, send_node) in enumerate(fresh_suffix): + try: + add_resp = await attacksApi.addMessage(create_resp.attack_result_id, + AddMessageRequest( + role='user', + pieces=_pieces_for_user_turn(input_ut, fan_variant), + send=True, + target_registry_name=path.target, + target_conversation_id=create_resp.conversation_id, + converter_ids=_resolved_converter_ids(input_ut, fan_variant), + labels=_build_labels(path, treeId, waveId, waveTriggerKind), + )) + except ApiError as e: + # Partial-commit: this Send (and any after it in the chain) fail. + # Per [01 §6.4.1], failed Sends have their execution nulled so the + # resolver correctly identifies them as needing fresh dispatch on retry. 
+ reason = _format_api_error(e, 'add_message') + sink.setNodeState(treeId, send_node.id, 'failed', reason=reason) + sink.clearExecution(treeId, send_node.id) + # Sends after this in fresh_suffix were marked `running` at dispatch start; + # flip back to stale and clear their executions too. + for _, _, later_send in fresh_suffix[idx + 1:]: + sink.setNodeState(treeId, later_send.id, 'stale') + sink.clearExecution(treeId, later_send.id) + return + # Record the Send's new ExecutionRecord. AR id is the leaf's AR (shared across + # all Sends on the chain); pieceIds are the fresh assistant pieces from add_resp + # (extracted via turn-number diff per §3.3a `_extract_new_assistant_pieces`). + new_pieces, prior_max_turn_number = _extract_new_assistant_pieces( + add_resp, prior_max_turn_number, + ) + record = build_execution_record( + attack_result_id=create_resp.attack_result_id, + conversation_id=create_resp.conversation_id, + assistant_pieces=new_pieces, + waveId=waveId, + waveTriggerKind=waveTriggerKind, + ) + sink.recordExecution(treeId, send_node.id, record) + sink.setNodeState(treeId, send_node.id, 'clean') + finally: + # Reconcile non-Send transform states regardless of outcome (§3.3a). Correctly + # handles full success (all UserTurn ancestors flip clean), partial success on + # mid-chain failure (UserTurn ancestors of the succeeded prefix flip clean; + # ancestors of the failed/stale suffix stay stale), and create_attack failure + # (no Sends became clean; no ancestors flip). + reconcileTransformStates(treeId, path) +``` + +**Why hold the semaphore for the whole sequence.** The N+1 calls all target the same AR (via `target_conversation_id = create_resp.conversation_id`) and reference state created by earlier calls in the sequence. Releasing the slot between calls would let other leaves race for it while this leaf is waiting on a mid-chain `add_message`, and the runner's per-tree serialization would no longer reflect actual in-flight calls. 
Holding the slot keeps the budget honest: `maxParallel=4` concurrent leaves = at most 4 active operator-meaningful chains, regardless of chain depth. + +**Partial-commit on mid-chain failure.** If `add_message` #3 of a 5-message sequence fails, the AR exists with the first 2 user turns + assistant responses successfully sent. The failed Send transitions to `failed`; Sends 4 and 5 transition back to `stale` (they were `running` before; the chain stopped before reaching them). The leaf shows `failed` because its add_message was never reached. The runner's `handleCompletion` then runs the §5.3 in-flight cascade: any sibling leaves in `ready` whose path includes the failed Send are dropped to `blocked` so they don't independently retry the same failure. The operator's retry from the toast re-dispatches the whole leaf, which: + +- Creates a brand-new AR (does not reuse the partial AR; see §7.5 below for the "no retry fast-path in V1.0" decision). +- Re-fires all stale Sends on the path. The previously-succeeded Sends in the prior partial dispatch are no longer reachable through this dispatch (their `ExecutionRecord`s point to the previous AR, which still exists in History as a partial row). + +**Field reference (verified against backend, [pyrit/backend/models/attacks.py](../../../pyrit/backend/models/attacks.py)):** + +- `CreateAttackRequest.prepended_conversation: list[PrependedMessageRequest] | None` — max 200 messages. +- `PrependedMessageRequest = { role: ChatMessageRole, pieces: list[MessagePieceRequest] }` — one message per turn; multimodal turns have multiple pieces in one PrependedMessageRequest. +- `AddMessageRequest = { role, pieces, send, target_registry_name, target_conversation_id, converter_ids, labels }` — `target_conversation_id` is **required always**; `target_registry_name` is required when `send=True`. 
+- `CreateAttackResponse = { attack_result_id, conversation_id, created_at }` — the runner needs both ids; `conversation_id` flows into the second-and-later `add_message` calls. + +**Idempotency.** The runner does not deduplicate. If the operator double-clicks Refresh, **two waves fire, two leaf AR sequences land** (cost ≈ 2× tokens). The §3.3b debounce catches the common case; the cost-guardrail modal (§2.3) catches the above-threshold case. + +### 3.3a Helpers referenced by the dispatch step + +The §3.3 pseudocode uses several helpers that need explicit specs (the implementer cannot guess them from the call sites alone). + +**`_extract_new_assistant_pieces(add_resp, prior_max_turn_number)`** — `AddMessageResponse.messages` is a `ConversationMessagesResponse` (verified against [pyrit/backend/models/attacks.py L153-L157](../../../pyrit/backend/models/attacks.py#L153)) whose `.messages: list[Message]` carries the **entire conversation**, not just the new pieces. Each `Message` has `.turn_number` (1-indexed), `.role`, `.pieces: list[MessagePiece]`. The runner identifies just-added assistant pieces by turn-number diff: before each `add_message` call, hold `prior_max_turn_number` (initialized to `len(prepended_conversation)` after `create_attack` returns, since `turn_number` is 1-indexed); after the call returns, walk `add_resp.messages.messages` and collect pieces from any Message whose `turn_number > prior_max_turn_number` and `role == 'assistant'`. Update `prior_max_turn_number` for the next iteration. 
+ +```python +def _extract_new_assistant_pieces(add_resp, prior_max_turn_number): + new_pieces = [] + new_max = prior_max_turn_number + for msg in add_resp.messages.messages: # AddMessageResponse.messages: ConversationMessagesResponse + if msg.turn_number > prior_max_turn_number and msg.role == 'assistant': + new_pieces.extend(msg.pieces) + new_max = max(new_max, msg.turn_number) + return new_pieces, new_max +``` + +If V1.1 adds a backend `?since_turn=N` filter, this helper collapses to one extend call; the V1.0 walk is O(messages-in-AR) per add_message, which is bounded by the 200-message cap. + +**`_format_api_error(error, call_name)`** — classifies an API error into one of three failure classes for retry UX: `'transient'` (5xx + network/timeout; retry-eligible), `'rate_limited'` (HTTP 429 + provider-specific overloaded errors; retry-eligible but gated until the operator manually re-triggers), `'permanent'` (4xx other than 429: validation, operator-lock mismatch, target-not-found; retry-ineligible without operator action). The wave-complete toast ([02 §2.3](02_tree_ui_affordances.md#23-canvas-level-affordances)) reads `error.failure_class` to decide the [Retry failed] button gating and per-class summary count. + +```python +def _format_api_error(error, call_name): + if error.status_code is None: # network error / timeout + return ApiErrorReason( + message=f"{call_name} failed (network): {error.message} — likely transient, retry", + failure_class='transient', + ) + if error.status_code == 429 or _is_provider_rate_limit_shape(error): + # Provider-specific shapes: Anthropic overloaded_error, OpenAI rate_limit_exceeded, + # Azure-specific. See [Q.G.1](#12-open-questions) for the small detection registry. 
+ return ApiErrorReason( + message=f"{call_name} rate-limited ({error.status_code}): {error.message} — wait for the target's rate-limit window, then retry", + failure_class='rate_limited', + ) + if 500 <= error.status_code < 600: + return ApiErrorReason( + message=f"{call_name} failed ({error.status_code}): {error.message} — transient, retry", + failure_class='transient', + ) + if error.status_code == 400 and 'operator' in (error.message or '').lower(): + return ApiErrorReason( + message=f"{call_name} blocked by operator lock — branch from this node to take ownership", + failure_class='permanent', + ) + return ApiErrorReason( + message=f"{call_name} failed ({error.status_code}): {error.message}", + failure_class='permanent', + ) +``` + +The leaf's stored `lastError` carries both fields. Wave-summary aggregation counts each leaf's terminal `failure_class` into the toast's three-class breakdown (`transient` / `rate_limited` / `permanent`). The [Retry failed] button is enabled when at least one leaf has `failure_class ∈ {'transient', 'rate_limited'}` AND no rate-limited-only state — i.e., button is disabled when *every* failed leaf is `rate_limited` (the operator must wait); enabled when *any* failed leaf is `transient` (button retries only the transient subset; rate-limited leaves stay failed in the toast and a follow-up manual Refresh tree retries them once the operator believes the window has cleared). Tooltip text follows the gating: rate-limited-only → *"All N failed leaves were rate-limited. Wait for the target's rate-limit window to clear, then click Refresh tree to retry."*; mixed → *"Retrying N transient failures; M rate-limited leaves are excluded and remain failed in the wave summary."* V1.x adds `Retry-After` header parsing and a countdown timer (see [§12 Q.7](#12-open-questions)). + +**`_root_prompt_as_user_turn(root_node)`** — promotes a `RootPromptNode` into the shape `_make_user_turn_message` expects. 
The `text` becomes the user-turn text; the `attachments` become the user-turn attachments. `systemPrompt` does NOT become part of this user turn — it routes separately (see below). + +**`_systemPrompt_as_prepended_message(root_node)`** — `CreateAttackRequest` has no `systemPrompt` field (verified against [pyrit/backend/models/attacks.py L221-L243](../../../pyrit/backend/models/attacks.py#L221)). The backend pattern for system prompts is `PrependedMessageRequest` with `role='system'` as the first prepended message. When `root_node.params.systemPrompt` is non-empty, the resolver prepends a synthetic system message to the `prepended` list: + +```python +def _systemPrompt_as_prepended_message(root_node): + if not root_node.params.systemPrompt: + return None + return PrependedMessageRequest( + role='system', + pieces=[MessagePieceRequest( + role='system', + original_value=root_node.params.systemPrompt, + converted_value=root_node.params.systemPrompt, + original_value_data_type='text', + converted_value_data_type='text', + )], + ) +``` + +The system message is always at sequence 0 (first in `prepended_conversation`). Counts against the 200-message cap. If absent, the AR has no system message — same as today's chat tab default. + +**`reconcileTransformStates(treeId, path)`** — non-Send nodes (UserTurn, Fan, Score) are marked `stale`/`edited` by the [01 §6.3 propagation rules](01_tree_primitives.md#63-propagation-rules) but the runner's dispatch loop only transitions Send-state. After each successful Send completion, the runner walks back up the path and flips any `stale` UserTurn / Fan / Score whose ancestors are now all `clean` back to `clean`. Without this, the canvas shows lingering yellow borders on transform nodes after a fully-successful refresh. 
+ +```python +def reconcileTransformStates(treeId, path): + """Walk ancestors of just-completed Sends; flip transforms to clean when ancestors are clean.""" + for node in path: + if isinstance(node, (UserTurnNode, FanNode, ScoreNode)): + if node.state == 'stale' and all(p.state == 'clean' for p in node.parents): + sink.setNodeState(treeId, node.id, 'clean') +``` + +Called once per leaf dispatch — from the `finally` block of the §3.3 dispatch body and from the prepended-cap early-return path — so it runs on every dispatch outcome, not only after a successful Send. Idempotent: a node already `clean` is unchanged. + +**`reconcileAllTransforms(treeId)`** — the wave-end sibling helper. Same per-node rule as `reconcileTransformStates`, but iterates **every** node in the tree (not just the path). Called once at §3.1 step 6 prologue, after the dispatch loop settles and before `emitWaveEvent({ kind: 'complete' })`. Catches transforms (especially ScoreNodes) attached as siblings of Sends rather than on a dispatched leaf's path — the operator-typical ScoreNode placement that the path-scoped `reconcileTransformStates` cannot reach. + +```python +def reconcileAllTransforms(treeId): + """Walk every transform node in the tree once; flip stale→clean where ancestors are clean.""" + tree = workspace.currentTree + for node in tree.nodes: + if isinstance(node, (UserTurnNode, FanNode, ScoreNode)): + if node.state == 'stale' and all(p.state == 'clean' for p in node.parents): + sink.setNodeState(treeId, node.id, 'clean') +``` + +Idempotent and cheap (O(tree-size) once per wave); the per-dispatch calls remain in place so canvas state catches up incrementally as leaves settle, and the wave-end pass ensures sibling transforms reconcile too. 
+ +**`_pieces_for_user_turn(user_turn, fan_variant)` and `_resolved_converter_ids(user_turn, fan_variant)`** — straightforward: the former builds the `MessagePieceRequest` list (attachments + text) for the user turn, applying any `converter` fan-axis variant payload that overrides the in-path UserTurn's params; the latter resolves the converter pipeline (the UserTurn's `converterPipeline` plus any fan-variant converter list) into the `converter_ids` list the backend's converter machinery expects. + +**`_build_labels(path, treeId, waveId, waveTriggerKind) → Record`** — builds the labels dict that gets sent on every `CreateAttackRequest` and `AddMessageRequest` in the leaf's dispatch sequence. All keys are present in every wave's calls per the [§4.3 piece-label divergence invariant](#43-label-writes-the-round-trip-fidelity-contract). Conditional fields are omitted (not `null` or empty-string) when not applicable so the backend's `_resolve_labels` ([attack_service.py:L716](../../../pyrit/backend/services/attack_service.py#L716)) doesn't fall back to existing-piece labels for a key that should remain unset. + +```python +def _build_labels(path, treeId, waveId, waveTriggerKind) -> dict[str, str]: + """Returns the labels dict for every CreateAttackRequest and AddMessageRequest + in a leaf's dispatch sequence (§4.3 invariant: identical across all calls).""" + tree = path.tree # the ConversationTree the leaf lives in + operator = currentOperator() + assert operator is not None and operator != '', ( + "tag-hygiene gate bypassed: _build_labels reached with no operator. " + "The §2.1 entry-point shim step 1 must abort the wave with WaveEvent " + "'operator_tag_required' before dispatch reaches here. See 'Missing operator " + "tag handling' below for the contract." 
+ ) + labels = { + 'operator': operator, + 'operation': tree.operation or '', # operator-selected at tree creation; '' if not set + 'conversation_tree_id': str(treeId), + 'wave_id': waveId, + 'wave_trigger_kind': waveTriggerKind, + 'tree_path': json.dumps(path.tree_path_segments), # always present; '[]' for fan-less leaves + } + # parent_conversation_tree_id: only on cloned trees (set by branchToNewTree, [01 §6.5]). + # OMITTED for fresh trees (newTree, openTree from History without a parent). The + # auto-reverse path reads this key and treats absence as "no parent" — safer than + # writing the empty string, which History "Open clones of" would surface as a row + # claiming the tree is its own parent. + if tree.parentConversationTreeId is not None: + labels['parent_conversation_tree_id'] = str(tree.parentConversationTreeId) + return labels +``` + +**Missing operator tag handling (tag-hygiene gate).** `operator` is a tag the operator picks for their work — not an auth claim. The tag is what powers History filtering ("show me all my work"), per-operator `_validate_operator_match` isolation on the backend (operator-Y can't `add_message` against operator-X's tagged ARs), and the §15 audit log's work-attribution column. Under normal operation, the [§2.1 entry-point shim step 1](#entry-point-shim-ordering-v10) prevents any wave from dispatching when `currentOperator()` returns null/empty — `_build_labels` is never invoked in the missing-tag state, so no `operator: ''` AR is ever created. The UI surfaces a per-action modal (the runner's `WaveEvent { kind: 'operator_tag_required' }` triggers it, see [02 §2.3](02_tree_ui_affordances.md#23-canvas-level-affordances)) so the operator sets a tag and re-triggers; the wave-start gate fires once at the canvas-level click moment, not per-leaf. + +**Hard assertion at dispatch time — no defense-in-depth fallback.** `_build_labels` includes `assert operator is not None and operator != ''` at its entry. 
If the shim's tag-hygiene gate is somehow bypassed (test fixture that mocks the gate, future runner refactor that misses the gate, mid-wave tag-cleared race), the assertion fires and the dispatch panics rather than silently writing `operator: ''` ARs. Reviewer rev-16 caught that an earlier defense-in-depth path that wrote `operator: ''` was **broken under the previously-spec'd [§9.4.5 backend tightening](01_tree_primitives.md#945-hard-backend-dependency-relocate-_validate_operator_match)** (since-reverted per Q.S.2 rev 18 — see that section's body): the tightened `_validate_operator_match` would have raised an operator-mismatch error against requests with an empty operator label, so the supposed defense-in-depth ARs would always 400 at the first `add_message`. Even with Q.S.2 reverting the tightening (the no-labels early-return is preserved, so empty operator now passes through), the assert-and-panic path is still the right choice because (a) silently writing `operator: ''` ARs is operator-hostile regardless of backend response — the audit trail loses authorship; (b) the asymmetry of "which backend version is deployed" was itself a hazard. Rev-16 chose the assert-and-panic path: the gate IS the contract; defense-in-depth-by-empty-string was a non-functional rationalization. The earlier "empty-string is grep-able in History" argument also failed under the tightening since those records never get created past the first message. + +**`tree_path` segments are computed once per dispatch.** `path.tree_path_segments` is `list[tuple[str, int]]` — the (axis, slotIndex) tuples for every `FanNode` ancestor on the leaf's root-to-leaf path, in topo order. Computed from the path itself (no separate state); JSON-encoded inside `_build_labels`. Empty array for leaves with no fan ancestors; encoded as `'[]'` (the parser per [§4.3 tree_path encoding](#tree_path-encoding-v10-json-to-keep-forward-compatible) accepts both `'[]'` and absence). 
+ +**Piece-fetch caching for `_load_piece_as_request(pid)`** in `_load_send_response_as_message` (§4.1). The backend exposes **no piece-by-id endpoint** ([routes/attacks.py](../../../pyrit/backend/routes/attacks.py) lists only conversation-level reads); the only read path for piece data is `GET /api/attacks/{attack_result_id}/messages` which returns every piece for one AR's conversation. The cache is populated **at wave-start** by a pre-fetch pass: the runner walks each leaf's clean-prefix Sends, collects the distinct source-AR ids referenced by those Sends' `execution.attackResultId` fields, and issues **one `GET /messages` per distinct AR** (not one per piece). Each response's pieces all land in `pieceCache` keyed by `piece.id`; `_load_piece_as_request(pid)` then resolves from the cache without per-piece HTTP. For a 60-leaf wave with 10-deep clean prefixes referencing 5 distinct source ARs, the pre-fetch issues 5 HTTPs, populates ~300 pieces, and avoids the ~600 per-piece round-trips the cache name initially suggested. Cache lifetime is one wave (cleared on wave-complete) to keep memory bounded; cross-wave reuse is not attempted because intervening Refresh activity may have invalidated piece content. *Backend note:* a future `GET /api/pieces/{id}` endpoint would let the cache become lazy (fetch-on-miss) instead of pre-fetch, but isn't needed for V1.0 — conversation-level reads are cheap and already paid for in the auto-reverse path (§9.3). + +### 3.3b Debounce on `refreshTree` / `refreshSubtree` + +V1.0 firm: the refresh button handler debounces user clicks at **250 ms** before dispatching. The debounce is in the UI button handler, not in the runner — the runner's API is intentionally fire-and-trust. Double-clicking the button within 250 ms collapses to one runner invocation. 
+ +**Single debounce module across UI surfaces.** The debounce module lives at `frontend/src/ui/refreshHandlers.ts` and exposes one hook `useDebouncedRefresh()` plus one global event emitter `refreshBus` (a singleton `EventTarget`). The wiring: + +- **Ribbon button** (`` component): calls `useDebouncedRefresh().refreshTree(treeId)` on click. Hook-internal `setTimeout` enforces the 250 ms window. +- **Right-click "Refresh subtree"** (in the [react-flow context menu](https://reactflow.dev/api-reference/components/context-menu)): calls the same hook via the menu item's `onClick`. +- **`R` keyboard shortcut** (registered in ``'s `onKeyDown`): dispatches `refreshBus.dispatchEvent(new CustomEvent('refresh_subtree_request', { detail: { treeId, nodeId } }))`; the hook listens to `refreshBus` and routes through the same debounce. +- **Cross-surface coalescing:** the hook stores `lastFireAtByTree: Map`; any call within 250 ms of the previous fire (regardless of surface) is dropped. The bus pattern is just to avoid prop-drilling the hook into every component. + +The `frontend/src/runner/runner.ts` module does NOT depend on the debounce module — the runner is invoked by the hook, not the other way around. This keeps the runner test surface clean of UI concerns. + +**Operator override:** shift-click or Cmd-click bypasses the debounce and fires a second wave immediately, for operators who actually want N waves back-to-back. The escape hatch keeps the debounce from blocking power users. + +**Why this matters.** A 60-leaf refresh whose second wave fires from a double-click = 120 AR sequences = $$$ at typical model prices. The cost-guardrail modal (default `confirmThresholdCount = 20`) only intercepts the *first* click in a double-click; the second click already cleared the modal and fires unmodaled. Debouncing in the UI is the only reliable defense. + +## 4. 
Per-leaf AR Materialization + +### 4.1 The resolved root-to-leaf path → (prepended, final user turn) + +For a leaf `SendNode` L, walk parents to the root and partition the path's Sends into a **clean prefix** (Sends whose current params still match their executions — their input UserTurns and stored assistant pieces can be loaded into `prepended_conversation` as historical context) and a **fresh suffix** (the first stale Send and everything after, down to the leaf — each (input UserTurn, Send) pair becomes one sequential `add_message` call per §3.3). + +This partition is the central trick that makes Option A work: an N-deep stale chain becomes one AR with `prepended_conversation` covering everything above the first stale Send, plus N sequential `add_message` calls to regenerate the stale Sends in topo order. The leaf and all its interior-Send ancestors share one AR; History stays clean. + +#### V1.0 implementation reality — no clean-prefix optimization + +The design above describes the eventual V1 behavior. **V1.0 actually ships the dumb-but-correct variant: every Send on the path enters `freshSuffix`; the clean-prefix branch is disabled.** Loading clean-prefix Sends into `prepended_conversation` requires fetching their stored assistant pieces by `piece_id`; the backend has no `GET /api/pieces/{id}` route, so the runner needs a per-wave cache populated from `GET /attacks/{id}/messages` of the source ARs. That cache (the `_load_piece_as_request` helper spec'd in §3.3a) is **not in V1.0**. + +Two paths out of the missing-cache state were considered: + +1. **Build the cache in V1.0** — ~150 LOC across a new module + integration with wave-start, plus tests. Closes the design's contract; preserves all the cost arguments below. +2. **Force every Send into `freshSuffix`** — ~10 LOC change in the resolver. Operators pay the full re-dispatch cost on every wave; correctness is restored because every assistant message the target sees was actually generated by the target. 
+ +V1.0 ships option 2. The trade-off: an operator who edits only a leaf at the bottom of a 10-deep clean chain pays 12 calls (1 `create_attack` + 11 `add_message`s — the system message at most goes into `prepended_conversation`, then every Send re-fires from the root) instead of the 2 calls the design's clean-prefix model would have produced. ~6× cost regression for the "edit-leaf-only" hot path. Acknowledged in [01 §1.2](01_tree_primitives.md#12-v10-known-limitations-sharp-edges-in-what-v10-does-ship). + +The reason for option 2: silently writing fabricated assistant context (placeholder strings, or omitting the `original_value` field) into the model's history produces target responses conditioned on nonexistent prior turns. The model would either reject (validation), or worse, accept and reason against fabricated history — neither is acceptable for a red-teaming tool. Re-firing the chain is honest. + +**V1.x ships the piece cache** and restores the clean-prefix optimization. The migration is incremental: the resolver gains back the `clean prefix vs fresh suffix` split, the dispatcher reads from the cache at piece-construction time, and operator cost drops back to the per-leaf-edit minimum. The wire shape of `CreateAttackRequest.prepended_conversation` does not change; only the runner's population logic does. + +The pseudocode below documents the eventual V1 model. The V1.0 implementation is the same code with the `clean_prefix` branch removed — `seen_first_stale` is always `True` from the first Send, and every Send enters `fresh_suffix`. + +```python +def resolve_path_partition(path): + """Returns (prepended, fresh_suffix). + + - prepended: list[PrependedMessageRequest], one entry per turn in the clean prefix. + Multimodal turns (e.g. user text + image) become ONE PrependedMessageRequest with + multiple pieces (max 50 per the backend model). The backend caps prepended length + at 200 messages.
+ - fresh_suffix: list[(UserTurnNode, fan_variant_or_None, SendNode)] in topo order, + each entry becoming one add_message(send=True) call. The last element is always + (leaf_input_user_turn, leaf_fan_variant_or_None, leaf). + + V1.0 has no intra-wave shared-piece cache (per §3.2 V1.0-decision; deferred to V1.1 + per §12 Q.6). Each leaf's dispatch independently regenerates every stale Send on + its path — if multiple leaves share a stale interior Send, the target is called + once per leaf for that Send. + + The path is `[Source, UserTurn, Send, UserTurn, Fan, Send, ...]` (per [01 §5.1 invariant 5](01_tree_primitives.md#51-invariants) — a Send's *first non-Fan, non-Score ancestor* on the path is always a UserTurn with `role='user'` or a RootPromptNode). FanNode and ScoreNode pass through transparently; the resolver holds `pending_user_turn` across Fan/Score boundaries so a Send inside a Fan(attempt) picks up the Fan's parent UserTurn (with fan-variant override applied at piece-construction time). + """ + prepended = [] + fresh_suffix = [] + pending_user_turn = None # UserTurn waiting to be paired with the next Send (held across Fan/Score) + pending_fan_variant = None # axis+slot for the most recent Fan ancestor; resets when we exit the Fan + seen_first_stale = False + + for node in path: + if isinstance(node, RootPromptNode): + # Root prompt is the first user-role turn; treat its text as a UserTurn input + # for the first Send. systemPrompt (if any) routes through PrependedMessageRequest + # with role='system' as the FIRST prepended message — there is no systemPrompt + # field on CreateAttackRequest (verified against backend models/attacks.py). + # See §3.3a `_systemPrompt_as_prepended_message` for the helper spec. 
+ sys_msg = _systemPrompt_as_prepended_message(node) + if sys_msg is not None: + prepended.append(sys_msg) + pending_user_turn = _root_prompt_as_user_turn(node) + pending_fan_variant = None + elif isinstance(node, UserTurnNode): + # Hold this UserTurn until we see its downstream Send. Reset the fan-variant + # cursor — a new UserTurn means we're past any fan whose variant applied to + # a previous UserTurn. + pending_user_turn = node + pending_fan_variant = None + elif isinstance(node, SendNode): + assert pending_user_turn is not None, ( + "tree-shape invariant ([01 §5.1] #5): every Send has a UserTurn/Root " + "ancestor on the path (Fan/Score may sit between them transparently)" + ) + # Per §3.1, S = {edited, stale, failed, cancelled}. The state check covers all + # four explicitly; the `execution is None` clause is the safety net for + # failed/cancelled (per [01 §6.4.1] they have execution=null) and for the + # rare case of a leaf with no prior execution at all (freshly-added Send + # that's never been refreshed). + is_stale = (node.state in {'edited', 'stale', 'failed', 'cancelled'}) or (node.execution is None) + if not seen_first_stale and not is_stale: + # Still in the clean prefix: load this turn's input + assistant response from storage. + prepended.append(_make_user_turn_message(pending_user_turn, pending_fan_variant)) + prepended.append(_load_send_response_as_message(node)) # role='assistant', multimodal ok + else: + seen_first_stale = True + # Fresh suffix: this pair will fire via add_message in §3.3. The variant + # is carried alongside the UserTurn so add_message gets the right converter_ids + # and piece content. + fresh_suffix.append((pending_user_turn, pending_fan_variant, node)) + # The Send "consumes" the pending UserTurn — next iteration needs a fresh one + # (typically supplied by the next UserTurn or RootPromptNode in the path). 
+ pending_user_turn = None + pending_fan_variant = None + elif isinstance(node, FanNode): + # Structural pass-through. Capture which (axis, slot) we're descending into so + # the resolver can apply the variant payload to the downstream Send's content. + # The path's downstream node carries the chosen child's slot index in its + # edge.slotIndex; the resolver reads it here. pending_user_turn is held across + # the Fan (NOT cleared) so a Fan(attempt) directly above a Send works correctly: + # the Send's input is the UserTurn ABOVE the Fan, varied by the fan's variant. + pending_fan_variant = (node.params.axis, path.edge_slot_for(node)) + elif isinstance(node, ScoreNode): + # Observational pass-through; no piece contribution. Holds pending_user_turn + # and pending_fan_variant unchanged. + pass + + # Sanity: the leaf must always be the last element of fresh_suffix; if a leaf + # path ends with everything clean, the leaf itself must be in fresh_suffix because + # the operator wouldn't have triggered a dispatch on a clean node. + assert fresh_suffix and fresh_suffix[-1][2].id == path[-1].id, \ + "fresh_suffix invariant: ends at the leaf Send" + + return (prepended, fresh_suffix) + + +def _make_user_turn_message(user_turn_or_root) -> PrependedMessageRequest: + """Build a PrependedMessageRequest from a UserTurnNode or RootPromptNode-as-user-turn. + Multimodal pieces (text + attachments) are bundled into one message.""" + return PrependedMessageRequest( + role=user_turn_or_root.role, # 'user' | 'system' | 'simulated_assistant' + pieces=[_piece_from_attachment(a) for a in user_turn_or_root.attachments] + + [_piece_for_text(user_turn_or_root.text, user_turn_or_root.converter_pipeline)], + ) + + +def _load_send_response_as_message(send_node) -> PrependedMessageRequest: + """Load the assistant pieces from a clean Send's prior execution into ONE message. + + Each piece carries forward its original_prompt_id so lineage chains stay intact + across re-prepends. 
The §9.4.4 (b) DTO extension exposes this field on + BackendMessagePiece; `_load_piece_as_request` reads it and writes it onto the + new MessagePieceRequest. The backend's MessagePieceRequest accepts + original_prompt_id as an optional field; absent → fresh lineage root. + """ + assert send_node.execution is not None, "clean Send must have an execution" + return PrependedMessageRequest( + role='assistant', + pieces=[_load_piece_as_request(pid) for pid in send_node.execution.pieceIds], + ) + + +def _load_piece_as_request(piece_id) -> MessagePieceRequest: + """Fetch the BackendMessagePiece (cached per-wave, §3.3a) and copy its fields + into a MessagePieceRequest, preserving original_prompt_id for lineage.""" + piece = pieceCache.get(piece_id) # cached for the duration of the current wave + return MessagePieceRequest( + data_type=piece.original_value_data_type, + original_value=piece.original_value or '', + converted_value=piece.converted_value, + mime_type=piece.original_value_mime_type, + original_prompt_id=piece.original_prompt_id, # PRESERVE lineage (§9.4.4 b dep) + prompt_metadata=piece.prompt_metadata, + ) +``` + +**Why partition.** Sends whose params haven't changed since they last executed have valid stored pieces — re-firing them is wasteful and yields different responses (target nondeterminism). Sends whose params changed need to re-fire to get a response that matches the new input. The partition is the natural boundary between the two. + +**Why interior Sends in the fresh suffix don't need their old `execution.pieceIds`.** They're about to be regenerated. Their old pieces become stale `ExecutionRecord` entries in `executionHistory` (per §6.6) — operators can checkout-detached to inspect, but the runner doesn't reference them in the new dispatch. 
+ +**Why interior Sends in the clean prefix DO need their old `execution.pieceIds`.** They're not being regenerated, so the target needs to see their prior assistant responses as historical context in `prepended_conversation`. + +**Leaf-only path with all-clean upstream.** Say the operator just hit `↻` on a leaf whose input UserTurn they tweaked (that UserTurn is `edited`; every Send upstream of it is `clean`). The partition produces: +- `prepended` = [Root user turn, Send1 assistant, UserTurn2, Send2 assistant, …, last clean Send's assistant] — the leaf's prior UserTurn version and prior assistant response are NOT prepended, because the leaf is regenerated +- `fresh_suffix` = [(leaf_input_user_turn_new_params, None, leaf)] + +Why the leaf lands in `fresh_suffix`: the partition rule above marks a Send as stale iff `node.state in {'edited', 'stale', 'failed', 'cancelled'} or node.execution is None`. A leaf the operator just tweaked has the *node above it* (the UserTurn) edited; the leaf Send itself is `stale` (per §6.3 rule 1) because its ancestor changed. So the leaf is in `fresh_suffix`. ✓ + +**Fan axis variant resolution (V1.0 axes).** When `path` traverses a `FanNode`, the path itself selects which child UserTurn is visited; the variant payload is resolved at piece-construction time inside `_make_user_turn_message`: + +- `axis='attempt'`: variant payload is empty `{}`; all attempts share identical `prepended` + identical `fresh_suffix` pieces (the AR id and creation timestamp differ). +- `axis='converter'`: the fan child's `converters: ConverterRef[]` is appended to the input UserTurn's `converter_pipeline` before piece construction. The `converted_value` differs per leaf. The runner also passes `converter_ids` on the corresponding `add_message` so the backend's converter machinery is engaged — without this, the converter axis does nothing at runtime. (V1.0 carries this in `AddMessageRequest.converter_ids` per the §3.3 dispatch code.)
+ +V1.1 axes (`prompt`, `target`, `system_prompt`, `temperature`) plug into the same resolver — the variant payload overrides a specific field on the in-path node (per [01 §4.4 FanVariant types](01_tree_primitives.md#44-structural-nodes--the-single-fan-out-primitive)). + +### 4.2 The 200-message cap + +`CreateAttackRequest.prepended_conversation` is capped at 200 messages by the backend model ([attacks.py L221-L243](../../../pyrit/backend/models/attacks.py#L221)). The cap is on `PrependedMessageRequest` count (messages, not pieces — a multimodal turn with 3 pieces is one message). **The cap applies only to `prepended_conversation`**; the backend does not cap conversation length grown via subsequent `add_message` calls. + +**The runner checks `len(prepended) > 200`** before dispatching. If over, the runner short-circuits before `create_attack` and the leaf transitions to `failed` with reason `"clean prefix exceeds 200 turns; branch from a midpoint to continue"`. The post-dispatch `add_message` sequence adds 2×N messages (one user + one assistant per Send in fresh_suffix) to the conversation but those don't count against this cap — they extend the AR's conversation past 200 messages cleanly. *Earlier revisions used `len(prepended) + len(fresh_suffix)` as a conservative estimate; this rejected valid dispatches whose `prepended` was under 200 but whose total post-`add_message` length exceeded it, even though the backend would have accepted them.* + +Under AR-per-leaf the cap is **per-root-to-leaf-path's clean prefix** — a tree with 1000 leaves at 10 turns deep is fine; only a leaf whose *clean prefix alone* exceeds 200 turns trips the cap. Operationally this is unreachable until a tree has accumulated 200+ clean Sends on a single chain, which is several waves' worth of refresh on a Crescendo-style depth-extending attack. 
+ +**V1.0 recovery path:** + +- **Soft warning at 180 turns of clean prefix** in the canvas-level ribbon ([02 §2.3](02_tree_ui_affordances.md#23-canvas-level-affordances)): *"This conversation is approaching the 200-turn prepended ceiling. Use Branch from a midpoint to keep extending."* +- **Hard refusal at 200 clean-prefix turns**: leaf goes `failed`; tooltip points at `📋` (`branchToNewTree`, V1.0-shipped per [01 §6.5](01_tree_primitives.md#65-branch-from-node---the-immutable-history-primitive)) as the recovery primitive. Operator picks a midpoint node, clicks `📋`, edits the midpoint's text to summarize the truncated prefix, and continues from there. +- The recovery is operator-driven; the runner does not auto-truncate (would silently change the conversation context the target sees). + +### 4.3 Label writes (the round-trip-fidelity contract) + +Every dispatched AR carries: + +| Label | Source | Version | Why | +|---|---|---|---| +| `operator` | Current user (per [01 §9.1](01_tree_primitives.md#91-operator-isolation-posture)) | V1.0 | Operator-isolation check; the V1.0 PR set carries the [§7.4 / §9.4.5](01_tree_primitives.md#945-hard-backend-dependency-relocate-_validate_operator_match) relocation so the server-side check survives `removed_in="0.16.0"` piece-label deprecation | +| `operation` | Operator-selected (existing chat flow) | V1.0 | History grouping | +| `conversation_tree_id` | `tree.id` | V1.0 | Groups all leaves from one tree (per [01 §2 Vocabulary](01_tree_primitives.md#2-vocabulary)) | +| `wave_id` | `waveId` (generated in §3.1) | V1.0 | Groups leaves from one operator action | +| `wave_trigger_kind` | One of [01 §14.1 enum](01_tree_primitives.md#141-the-data-model-addition) | V1.0 | `refresh_node` / `refresh_subtree` / `refresh_tree` / `retry_failed` (V1.0); `synced_peer_add` (V1.1); `cross_tree_rebase` (V2.1+) | +| `parent_conversation_tree_id` | Set by `branchToNewTree` on cloned trees (the source tree's id) | **V1.0** (per Patch #1) | History 
"where did I fork this from" navigation per [02 §7 A.1](02_tree_ui_affordances.md#7-decisions-and-open-questions); ships V1.0 because `branchToNewTree` ships V1.0 | +| `tree_path` | JSON-encoded array of `[axis, slotIndex]` pairs from root to leaf — see encoding below | **V1.0** (required) | Lets V1.1 fanout-detection reconstruct **nested fan structure** for V1.0+ trees without relying on `original_prompt_id` chain flattening (which loses nesting per [01 §9.3.1 caveat](01_tree_primitives.md#931-fan-grouping-algorithm-v11--original_prompt_id-chain-flattening--wave_id-disambiguator)). | + +These labels are the entire round-trip-fidelity story for V1.0 — the auto-reverse logic ([01 §9.3](01_tree_primitives.md#93-migration-of-existing-linear-attacks---auto-reverse-to-a-tree)) and the [§9.4.1 reload-reconstruction path](01_tree_primitives.md#941-reload-reconstruction-v10) read them back to reconstruct the tree. + +**Piece-label divergence invariant.** Within one leaf's dispatch sequence, every piece created by `create_attack` (the prepended messages) and every piece created by the N `add_message` calls carries the **same** label set: `operator`, `operation`, `conversation_tree_id`, `wave_id`, `wave_trigger_kind`, `parent_conversation_tree_id`, `tree_path`. The runner does not vary labels across the sequence's calls. This matters because the backend's [`_resolve_labels` at attack_service.py:L708](../../../pyrit/backend/services/attack_service.py#L708) prefers existing piece labels over request labels — if the runner accidentally diverged labels mid-sequence, later add_messages would silently inherit earlier pieces' labels. 
The invariant holds by construction (one `_build_labels(path, treeId, waveId, waveTriggerKind)` call passed identically to every request in the sequence), and is asserted by [§11.1 labels-divergence test](#111-unit-testable-in-isolation-no-backend) (client-side) AND [§11.2 labels round-trip test](#112-needs-the-backend-integration-tests) (catches backend `_resolve_labels` regressions — the silent-corruption class that the [§9.4.5](01_tree_primitives.md#945-hard-backend-dependency-relocate-_validate_operator_match) PR set anticipates). + +#### `tree_path` encoding (V1.0, JSON to keep forward-compatible) + +Earlier rev 10 used `/` segments joined by `,`. Rejected per reviewer rev 10 (C6): if any future fan axis name contains `/` or `,`, decoding breaks silently. V1.0 ships **JSON array of `[axis, slotIndex]` tuples**: + +``` +labels.tree_path = '[["prompt",2],["attempt",3]]' # nested: outer prompt fan, inner attempt fan +labels.tree_path = '[]' # leaf with no fan ancestors (empty array, not omitted) +labels.tree_path = '[["attempt",7]]' # single fan ancestor +``` + +**Parser contract:** + +```ts +function parseTreePath(label: string | undefined): Array<[string, number]> { + if (label === undefined || label === '') return [] + try { + const parsed = JSON.parse(label) + if (!Array.isArray(parsed)) throw new Error('not array') + return parsed.map(([axis, slot]) => { + if (typeof axis !== 'string' || typeof slot !== 'number') throw new Error('bad shape') + return [axis, slot] + }) + } catch (e) { + console.warn(`malformed tree_path label "${label}":`, e) + return [] // fail-soft: treat leaf as having no fan ancestors + } +} +``` + +**Forward compatibility:** if a future runner version writes a new `tree_path` format (e.g., embedding fan node IDs), older clients see malformed JSON → empty path → fall back to lineage-flattening for those leaves. No hard crash. 
+ +**Why drop the V1.0 `fan_axis` label.** Earlier rev 10 carried a separate `fan_axis` label (the immediate fan ancestor's axis) as a History-tab filtering convenience. Reviewer rev 10 (C7) flagged it as redundant data inviting drift. V1.0 drops it; History-tab filtering by "this leaf's immediate fan axis" derives from the last element of `parseTreePath(tree_path)` — one string-split-equivalent per row, irrelevant cost. + +## 5. State Machine + +The states and transitions are specified in [01 §6.1-§6.2](01_tree_primitives.md#61-states); this section names the runner's contract with the state machine, not the state machine itself. + +### 5.1 The runner only owns three transitions + +| From | To | Trigger | +|---|---|---| +| `stale` ∨ `edited` | `running` | Dispatch start | +| `running` | `clean` | Dispatch success | +| `running` | `failed` | Dispatch error | + +All other transitions (`clean` ↔ `edited` via operator edit, `clean` → `stale` via ancestor change, `running` → `cancelled` via wave abort) are owned by the React state container based on operator actions. The runner reads the state to decide eligibility; it does not write it except for its three transitions. + +### 5.2 Cascade-on-success + +When a `running → clean` transition fires: + +1. Sink records the ExecutionRecord. +2. Sink moves the node to `clean`. +3. The dispatch loop re-evaluates: for each `stale` child of this node, if *all* its parents are now `clean`, add it to `ready`. (Most fan children become ready simultaneously when their fan-parent goes clean; the next iteration of the loop will pick up to `maxParallel - inflight.size` of them.) + +### 5.3 Cascade-on-failure + +When a `running → failed` transition fires: + +1. Sink moves the node to `failed`. Its `node.execution` is nulled and `node.lastError` carries the reason ([01 §6.4.1](01_tree_primitives.md#641-why-nodeexecution--null-on-failure-not-preserved)). +2. 
**In-flight cascade.** The runner iterates `ready` and drops every leaf whose root-to-leaf path includes the just-failed Send. Dropped leaves transition to `stale` via `sink.setNodeState(treeId, leaf.id, 'stale', opts={'reason': { message: 'blocked by ancestor failure in wave <waveId>', failure_class: 'blocked' }})` — the structured reason populates the leaf's `lastError` with `failure_class='blocked'` so the wave-summary's `blocked` count ([§6 WaveEvent](#6-wave-bookkeeping)) can be computed by a single scan of terminal-state leaves' `lastError.failure_class` fields. The wave-summary counts them as **`blocked`** (not as `failed.*` — they never dispatched; the failure was the ancestor's). The dispatch loop's next iteration sees the reduced `ready` set and proceeds with the remaining leaves. +3. **Operator surface.** The [02 §2.3](02_tree_ui_affordances.md#23-canvas-level-affordances) wave-status banner renders the four-value summary `(N ✓, N ⚠ failed, N ⦾ blocked, N ○ cancelled)` during the wave and on the wave-complete toast. Hovering a blocked node's `⦾` chip shows *"Blocked by ancestor `<nodeId>` failure in this wave. [Retry failed] to attempt recovery."* +4. **Recovery is a separate operator gesture.** The [Retry failed] toast button (per [02 §5.14](02_tree_ui_affordances.md#514-partial-failure-mid-refresh)) calls [`runner.retryFailedNodes(treeId, nodeIds)`](#21-entry-points-the-public-api) with the wave-complete-captured `nodeIds` = the union of this wave's retryable failed leaves (`failure_class='transient'` only — `permanent` failures are not retryable, and `rate_limited` failures are excluded until the rate-limit window clears, as described later in this step) plus its `blocked` leaves. The runner builds `S` as `{nodeIds} ∪ {failed/cancelled Send ancestors on each nodeId's path}` — scoped to just wave-W's victims, not the whole tree. The new wave's [§3.1 step 2b pre-readiness demotion](#31-topological-walk) flips every `S`-member node currently in `failed`/`cancelled` back to `stale` *before* the readiness rule runs.
After demotion, the ancestor allowlist admits them (their state is now `stale`, in the allowlist; no longer `failed`/`cancelled`, in the exclusion set), the leaves below them satisfy readiness, and dispatch proceeds. The interior failed Sends are regenerated as part of each descendant leaf's fresh suffix per [§3.2](#32-what-gets-dispatched). Repeated 5xx (`failure_class='transient'`) on the same Send cascades the same way: each Retry-failed wave is a fresh attempt with no exponential backoff in V1.0. **Rate-limit failures** (`failure_class='rate_limited'` per [§3.3a `_format_api_error`](#33a-helpers-referenced-by-the-dispatch-step)) are surfaced distinctly in the wave-complete toast and excluded from `nodeIds` (the [Retry failed] button is disabled when *all* failed leaves are rate-limited, OR retries only the non-rate-limited subset; rate-limited leaves stay failed in the wave summary until the operator manually clicks Refresh tree after the rate-limit window clears). V1.x adds `Retry-After` header parsing + countdown timer + auto-enable (see [§12 Q.7](#12-open-questions)). + +**Why the toast captures `nodeIds` (not just `treeId`).** Reviewer rev-16 spotted that exposing only `refreshNode`/`refreshSubtree`/`refreshTree` meant `[Retry failed]` had no API to call — it would either fall back to `refreshTree(treeId)` (which sweeps unrelated edits the operator made between waves) or invent ad-hoc scope. The toast captures wave-W's failed+blocked leaf ids at wave-complete time and passes them to `retryFailedNodes`; the runner derives ancestors itself. This scope is stable even if the operator edits the tree between wave-W completion and the retry click — the retry only touches W's victims. 
+ +**Why a pre-readiness demotion and not a weakened readiness rule.** Reviewer rev-15 spotted that the previous §5.3 wording ("the new wave's readiness rule sees the failed-ancestor nodes IN ITS S so descendants can dispatch through them") was false against §3.1 as written — the rule inspects `node.state`, which is still `failed` regardless of which wave is computing. Two fixes were on the table: (a) demote at wave-start, gated on `waveTriggerKind='retry_failed'`; (b) weaken the readiness allowlist to "in S or clean" globally. Option (a) shipped because (b) would revert the anti-amplification fix that's the whole point of §5.3 — sibling leaves of a transiently-failed shared ancestor would each retry the ancestor independently, bringing back the `min(maxParallel, sibling_count)` retry-storm on rate-limited targets. The demotion is operator-invisible (the [Retry failed] click already implies "give up on the previous failure, try again"); the destructive `lastError` clobber is the price. + +**Why cascade in-flight instead of letting sibling leaves retry the shared ancestor.** Under the V1.0 no-coordination, no-backpressure model, sibling leaves sharing a transiently-failed Send X would each independently include X in `fresh_suffix` and retry it. With `maxParallel=4` and a 60-leaf fan against a rate-limited target, the first 429 on X cascades to ~48 more 429s on the same X as siblings dispatch. The in-flight cascade collapses this to one X-failure plus N blocked leaves; the operator's [Retry failed] click then surfaces the recovery as an operator-explicit gesture (cost-modal-visible, telemetry-attributable) rather than runner-invisible amplification. + +## 6. 
Wave Bookkeeping + +### 6.1 `waveId` generation + +Per [01 §14.4](01_tree_primitives.md#144-wave-id-generation---one-rule): + +```ts +function startWave(triggerKind: WaveTriggerKind): { waveId: string; waveTriggerKind: WaveTriggerKind } { + return { waveId: uuid(), waveTriggerKind: triggerKind } +} +``` + +One `waveId` per `refreshNode` / `refreshSubtree` / `refreshTree` / `retryFailedNodes` call. The wave never escapes the single dispatch-loop invocation that created it. + +### 6.2 `waveTriggerKind` enum + +The wire-level enum is defined in [01 §14.1](01_tree_primitives.md#141-the-data-model-addition); this table maps every operator-facing UI action that fires a wave to which of the four V1.0 enum values it carries. The caller passes the trigger kind to the runner; the runner does not infer it (the §14.4 decision was to make the source explicit). + +| UI action ([02 §2.2 / §2.3](02_tree_ui_affordances.md#22-per-node-action-rail)) | `waveTriggerKind` | Version | +|---|---|---| +| Node `↻` (per-node Refresh) | `refresh_node` | V1.0 | +| Node shift-`↻` / right-click "Refresh subtree" | `refresh_subtree` | V1.0 | +| Canvas-ribbon "Refresh tree" button | `refresh_tree` | V1.0 | +| Auto-trigger on first `addNode(send)` after authoring | `refresh_node` | V1.0 | +| Fan `+` (Add another variant) — runner refreshes the new variant alone | `refresh_node` | V1.0 | +| Fan-axis change (destructive op with confirm) | `refresh_subtree` | V1.0 | +| `branchToNewTree` → operator immediately edits & refreshes the cloned tree | `refresh_tree` | V1.0 | +| `↻×N` Re-run multiple (promotes Send to attempt-fan, runs all N children) | `refresh_subtree` | V1.0 | +| Auto-reverse opens a historical AR → no immediate wave (the AR is already executed) | (no wave generated) | V1.0 | +| Operator clicks Retry-failed in the wave-complete toast | `retry_failed` | V1.0 | +| Stack-`+` adds a synced peer set → runner refreshes all peers | `synced_peer_add` | **V1.1** (depends on Synced-Peers Stack) | +| Cross-tree refresh
(refresh B's root against A's current root — conceptually a cross-tree rebase) | `cross_tree_rebase` | V2.1+ | + +**Reflog drawer "Make current"** does NOT appear in this table because `makeCurrent` itself generates no wave — it's a pure pointer swap per [01 §6.7 step 6](01_tree_primitives.md#67-makecurrent---destructive-promotion-from-the-reflog). The operator's subsequent Refresh of the now-stale descendants is the wave-generating event, and it carries `refresh_subtree` (per [01 §14.4 note](01_tree_primitives.md#144-wave-id-generation---one-rule)). + +**Earlier 11-value enum collapsed.** Revision 15 (per reviewer Finding 1) absorbed five V1.0-specific kinds (`initial_send`, `fan_expand`, `fan_axis_change`, `branch_rebase`, `rerun_multiple`) into the three core verbs above. The UI-action column still names every distinct trigger; the `waveTriggerKind` column tells the runner which entry-point semantics fired. See [01 §14.1](01_tree_primitives.md#141-the-data-model-addition) for the rationale. + +The enum is **closed** in V1.0 (the listed kinds are the only legal values; introducing a new kind requires bumping the runner version). Operators see the kind in the §8.2 "Recent waves" drawer label. + +The enum lives in the primitives doc per [01 §14.1](01_tree_primitives.md#141-the-data-model-addition); the UI-affordance *mapping* lives here. Two locations because the enum is a data-model fact (touches the schema) and the mapping is a UI/runner fact (touches affordances). 
+ +### 6.3 Wave events + +```ts +export type WaveEvent = + | { kind: 'start'; waveId: string; triggerKind: WaveTriggerKind; estimatedCalls: number; treeId: ConversationTreeId } + | { kind: 'node_complete'; waveId: string; nodeId: ConversationTreeNodeId; outcome: 'success' | 'failure' } + | { + kind: 'complete'; waveId: string; + summary: { + succeeded: number; + failed: { transient: number; rate_limited: number; permanent: number }; // bucketed by [01 §6 lastError.failure_class](01_tree_primitives.md#61-states) + blocked: number; // §5.3 in-flight cascade victims (state=stale, failure_class='blocked') + cancelled: number; + reflog_evicted: number; + } + } + | { kind: 'busy'; treeId: ConversationTreeId; holderTabId: string } // §10.4 cross-tab advisory lock + | { kind: 'queued'; waveId: string; treeId: ConversationTreeId; queueDepth: number } // §10.3 per-tree queue + | { kind: 'reflog_eviction'; treeId: ConversationTreeId; nodeId: ConversationTreeNodeId; evictedExecutionId: string; preview: string } // single eviction outside a wave (e.g. makeCurrent at cap, §6.7 of primitives) + | { kind: 'operator_tag_required'; treeId: ConversationTreeId } // §2.1 entry-point shim step 1 tag-hygiene gate fired; wave never started +``` + +**Every event variant carries `emittedAt: string` (ISO-8601 UTC) (rev 18, per rubber-duck Finding C.1).** The field is implicit in the union above to keep the variant declarations readable; the sink populates it at `emitWaveEvent` callsite via a wrapper. Combined with the per-`ExecutionRecord` `dispatchedAt`/`targetFirstByteAt`/`completedAt` triple ([01 §4.6](01_tree_primitives.md#46-shared-types)), this gives the [02 §8.2 Recent waves drawer](02_tree_ui_affordances.md#82-the-v1-drawer-a-recent-waves-tab) the data it needs to render per-wave timing (wave duration = `complete.emittedAt - start.emittedAt`; per-leaf latency = `record.completedAt - record.dispatchedAt`). 
The [§11.1 invariants](#111-unit-testable-in-isolation-no-backend) (e.g., `inflight.size <= maxParallel`) become validatable in production rather than only in unit tests because the timestamp data is on every event and every record. Operators triaging *"the wave took 5 minutes — what was the runner doing?"* read the drawer; SREs reading aggregated logs read the same fields. + +**`complete.summary` shape (rev 16, per reviewer Findings 2 + 3).** Earlier revisions used a flat `failed: number`. The bucketed shape lets the [02 §2.3 ribbon](02_tree_ui_affordances.md#23-canvas-level-affordances) and [02 §5.14 toast](02_tree_ui_affordances.md#514-partial-failure-mid-refresh) drive separate counts/colors per failure class (`⚠ failed` for transient + permanent, `⏱ rate-limited`, `⦾ blocked`) without per-node scans. Wave aggregation iterates the wave's terminal-state leaves and buckets by `node.lastError?.failure_class`: leaves in `clean` increment `succeeded`; leaves in `failed` with class `transient`/`rate_limited`/`permanent` increment the matching `failed.<class>` bucket; leaves in `stale` with `failure_class='blocked'` increment `blocked`; leaves in `cancelled` increment `cancelled`. A `failed` leaf with `lastError===null` is treated as `transient` (defensive default; should not happen by construction but the aggregator is robust). The [Retry failed] button-gating logic ([§5.3 step 4](#53-cascade-on-failure)) reads `summary.failed.transient + summary.blocked > 0` for enablement. + +**Legacy single-int helper.** Callsites that just want "how many leaves failed (any class)" can use `totalFailed(summary) = summary.failed.transient + summary.failed.rate_limited + summary.failed.permanent`; the [02 §8.2 "Recent waves" drawer](02_tree_ui_affordances.md#82-the-v1-drawer-a-recent-waves-tab) uses this for the per-wave row's compact count. Test assertions and any analytics consumers built against the pre-rev-16 `failed: number` shape need to migrate to either `totalFailed(...)` or the bucketed fields. 
+ +The `complete.summary.reflog_evicted` count rolls up evictions that fired during the wave so the wave-complete toast ([02 §2.3](02_tree_ui_affordances.md#23-canvas-level-affordances)) can show *"Past runs evicted: N"* in one line instead of stacking N transient markers. Standalone `reflog_eviction` events (outside a wave) still fire individually for the ribbon marker. + +The UI subscribes to wave events to drive: +- The in-canvas progress bar ([02 §2.3](02_tree_ui_affordances.md#23-canvas-level-affordances): `[ ●●●●●●○○○○ ] 6/60 (3 ✓, 0 ⚠, 1 ●)`). +- The wave-complete toast ([02 §8.1](02_tree_ui_affordances.md#81-the-v1-chain-preview-banner--confirm-modal--toast--drawer-panel)). +- The "Recent waves" drawer tab ([02 §8.2](02_tree_ui_affordances.md#82-the-v1-drawer-a-recent-waves-tab)). +- The cross-tab busy modal ([02 §2.3](02_tree_ui_affordances.md#23-canvas-level-affordances): *"Another tab is refreshing this tree. [Refresh anyway] [Wait]"*). +- The reflog-eviction ribbon marker ([02 §2.3](02_tree_ui_affordances.md#23-canvas-level-affordances): *"Past run evicted from node X. [Pin evicted run] [Increase cap]"*). + +## 7. Failure & Partial-Commit Semantics + +Per [01 §6.4](01_tree_primitives.md#64-failure--partial-commit-semantics), the runner's failure contract is: + +1. **In-flight completes.** When the operator hits Cancel (V1.1) or an early failure triggers wave-abort, any dispatched-but-not-returned `create_attack`/`add_message` calls run to completion. The runner awaits all of `inflight`; it does not abandon the promises. +2. **Not-yet-dispatched → `cancelled`.** Nodes still in `ready` (or not yet ready due to a failed parent) transition to `cancelled` rather than staying `stale`. This distinguishes "operator stopped this wave" from "the next wave hasn't happened yet." +3. **No automatic re-dispatch.** The operator triggers retry explicitly. The wave-complete toast surfaces "[Retry failed]" which re-evaluates `failed` nodes against the current tree state. 
Retries on partial-success leaves (§3.3) skip `create_attack` and re-run only `add_message`. +4. **Single-leaf failure does NOT abort the wave.** A 60-leaf refresh where leaf 7 fails continues to process leaves 8-60. The wave summary reports `succeeded=59, failed=1, cancelled=0`. This matches the [02 §5.14](02_tree_ui_affordances.md#514-partial-failure-mid-refresh) scenario. +5. **Within-leaf mid-chain partial commit.** Per §3.3, the leaf dispatch is `create_attack` + N `add_message` calls. If add_message #k fails (for any k from 1 to N), the AR exists on the backend with the first k-1 user-assistant turn pairs successfully sent. The k-th Send transitions to `failed`; Sends k+1..N transition back to `stale`. **All Sends in fresh_suffix that did not complete (the failed Send and all later ones) have their `node.execution` nulled** per [01 §6.4.1](01_tree_primitives.md#641-why-nodeexecution--null-on-failure-not-preserved) — this is what makes the resolver's `is_stale` predicate (§4.1) correctly identify them as needing fresh dispatch on retry. The Sends that DID complete (k-1 of them) keep their fresh ExecutionRecords pointing to the partial AR. The leaf shows `failed`. **No fast-path retry in V1.0.** The operator's retry from the toast re-dispatches the whole leaf, creating a brand-new AR and re-firing all stale Sends on the path. The partial AR remains in History as a failed-mid-chain row (operators see it; not a regression vs. today's chat tab which has the same partial-attack semantics on target errors). 
*V1.1* may add a partial-retry fast-path that reuses the partial AR id and skips create_attack + the already-succeeded add_messages — deferred because (a) it adds a `partialAttackResultId: string | null` field to track the reusable AR id on the failed Send (the cleaner V1.1 alternative to bringing back a `'partial'` outcome), (b) the dispatch loop grows a retry-aware branch, and (c) telemetry will show whether retries are common enough to justify the optimization. + +**Wave-abort triggers (V1.0):** the explicit Cancel chip in the wave-status banner ([02 §2.3](02_tree_ui_affordances.md#23-canvas-level-affordances)). Per §9, V1.0 ships UI-level cancellation: cancel flips a per-wave flag the dispatch loop checks at each `ready.popNext()` boundary — already-dispatched leaves still complete (step 1 above); undispatched leaves transition to `cancelled` (step 2 above). + +**Wave-abort triggers (V1.x):** V1.x adds backend-token cancellation that aborts in-flight HTTP calls too, eliminating step 1's "in-flight completes" caveat. + +## 8. Backend Call Mapping + +### 8.1 Per-leaf dispatch: `create_attack` + N `add_message`s + +Per [01 §7.1](01_tree_primitives.md#71-conversationtree-operation--backend-call) and §3.3 of this doc, each leaf's full dispatch sequence is one AR with a `create_attack` setup call plus N `add_message` calls, where N = the count of stale Sends on the leaf's root-to-leaf path (including the leaf itself). All calls share the same `attack_result_id` returned by `create_attack`; the runner passes `target_conversation_id = create_resp.conversation_id` on every `add_message`. 
+ +| Operator intent | Backend call(s) | Notes | +|---|---|---| +| Refresh leaf SendNode (chain wholly clean upstream) | (1) POST [`/api/attacks`](../../../pyrit/backend/routes/attacks.py#L184) with `prepended_conversation` = all clean-prefix turns + assistant responses; (1) POST [`/api/attacks/{id}/messages`](../../../pyrit/backend/routes/attacks.py#L432) for the leaf's input UserTurn | Per §3.3. Two calls, one slot, one ExecutionRecord on the leaf. | +| Refresh leaf SendNode (chain stale from depth k) | (1) POST `/api/attacks` with `prepended_conversation` = clean prefix only (turns 1..k-1 plus their assistant responses); (N-k+1) POST `/api/attacks/{id}/messages` calls, one per stale Send from k to leaf | Per §3.3. N-k+2 calls total, one slot, one AR. Each interior Send in the fresh suffix gets its own ExecutionRecord that shares the leaf's AR id. | +| Refresh interior SendNode in isolation (operator clicks `↻` on an interior, not on a leaf) | Same as above where the operator-targeted Send is treated as the leaf for this dispatch sequence | The actual leaf below the targeted Send stays `stale` until separately refreshed. | +| Retry a partial-failed leaf (§7 rule 5) | Same as "chain stale from depth k" — brand-new AR, all stale Sends re-fired | No reuse of the partial AR id in V1.0; the fast-path optimization is V1.1 (gated on a future `partialAttackResultId` field). | +| Edit node params | (no backend call) | State-only; marks descendants stale per [01 §6.3](01_tree_primitives.md#63-propagation-rules). | +| Delete tree node | (no backend call) | State-only; backend ARs persist per [02 §5.16](02_tree_ui_affordances.md#516-delete-a-branch). | +| Branch from node | (no backend call) | **V1.0** (per Patch #1); cheap-refs operation per [01 §6.5](01_tree_primitives.md#65-branch-from-node---the-immutable-history-primitive). Lands by swapping the active tree (V1.0) or opening a new tab in the strip (V1.1). 
| + +### 8.2 Why every leaf uses `create_attack` + N `add_message`s (not one or the other alone) + +[`create_attack`](../../../pyrit/backend/services/attack_service.py#L277) is **context setup only** — it persists the `prepended_conversation` history into the new AR's conversation but does **not** invoke the target. Only [`add_message`](../../../pyrit/backend/services/attack_service.py#L570) with `send=True` fires the target call and produces an assistant response. This is existing backend semantics; the runner mirrors them. + +**Why not `create_attack` alone (with all stale turns as prepended).** A "single create_attack per leaf, no add_message" runner would create the AR with prior history but never invoke the target — operators would click Refresh to discover zero assistant outputs. Add_message is what makes the model produce something. + +**Why not `add_message` alone (extending an existing leaf's AR with a new turn).** This would be the natural fit for "operator added one more UserTurn+Send pair on the end of a clean leaf — just send the new turn against the existing AR." Rejected for V1.0: + +1. **AR-per-leaf says every leaf is its own AR** ([01 §7.2](01_tree_primitives.md#72-conversationtree-to-execution-materialization-rule)). Extending an existing AR's conversation breaks the property that `labels.conversation_tree_id` filtering returns a clean leaf set: the previously-leaf Send would now be interior, but its AR still claims it as a leaf. +2. **`add_message` is operator-and-target locked.** [`_validate_operator_match`](../../../pyrit/backend/services/attack_service.py#L682) and [`_validate_target_match`](../../../pyrit/backend/services/attack_service.py#L647) check the existing AR's labels. Cross-operator or cross-target extensions immediately 400; the runner would have to fall back to create_attack anyway. Simpler to always create_attack. +3. 
**The cost is dominated by token usage, not HTTP overhead.** One `create_attack` with a 12-message `prepended_conversation` plus an `add_message` costs nearly the same as a single `add_message` to a pre-existing AR — both re-send the full context to the target (PyRIT targets are not server-stateful). + +**Why the split between prepended and add_message.** `prepended_conversation` is the *cheap* way to inject clean-prefix history — one bulk insert into a new conversation, zero target calls, no operator-lock checks on individual turns. Using N add_messages to build up the clean prefix would be N round-trips, N target validations, N target calls re-firing turns the operator already had answers for. The combined approach gets the best of both: one cheap setup call for everything that doesn't need to re-fire, plus N add_messages for everything that does. The partition rule in §3.3 / §4.1 decides where the clean/fresh boundary sits. + +V1.1 may revisit `add_message`-only extension for the "extend the main path of a clean leaf by one turn" hot-path optimization if telemetry shows it matters — operationally it requires either relaxing the AR-per-leaf invariant or introducing a per-Send `parentAttackResultId` field to track "this Send extends that AR." Neither is V1.0. + +### 8.3 Future calls (V1.1+) + +| Operation | Call | Version | +|---|---|---| +| Score a leaf | POST `/api/scores` (does not exist yet) | V1.1 — needs backend route + scorer service wiring | +| Persist a ConversationTree | POST `/api/conversation_trees` (does not exist) | V2 — per [01 §11](01_tree_primitives.md#11-future-work-conversationtree-persistence) | +| Resume a persisted tree | GET `/api/conversation_trees/{id}` | V2 | + +## 9. Cancellation + +**V1.0 ships UI-level cancellation; backend-token cancellation is V1.x.** The two have different cost/value profiles and only the first is needed for the operator's "stop this 600-call refresh before it bills me $30" workflow. 
The runner exposes **two distinct cancel operations** so the operator can act on either the active wave or the queued waves without confusing the two: + +- **`cancelWave(treeId)`** — cancels the currently-dispatching wave; in-flight HTTP calls complete; not-yet-dispatched leaves flip to `cancelled`. Resolves when the wave is fully settled. +- **`cancelQueued(treeId)`** — drops every wave on `queueByTree[treeId]` ([§10.3](#103-backpressure-per-tree-wave-queue)) without touching the active wave. Each dropped wave emits a `WaveEvent { kind: 'complete', summary.cancelled: <leaf count> }` (the dropped wave's own leaf count) so the UI reconciles its queued banner. + +The two operations are independent: clicking the active-wave Cancel does NOT drop the queue (the next queued wave still starts when the active one settles); clicking Cancel-queued does NOT abort the active wave. Operators wanting both call both — the UI's "Cancel everything" affordance (not in V1.0; flagged for V1.1 if operators request it) would call them in sequence. + +**V1.0: UI-level cancel flag at `ready.popNext()` boundary.** The runner's per-wave loop (per §3.1 step 2b) initializes `cancelled = false` at wave start. `cancelWave(treeId)` flips the flag to `true` for the matching active wave. The dispatch loop checks the flag at each `ready.popNext()` iteration (after each leaf finishes, before the next leaf starts): + +```python +while ready and not cancelled: + n = ready.popNext() + ... +# After loop: wave settled. Flip remaining nodes to 'cancelled'. 
+if cancelled: + for n in S - completed_set: # everything in S that didn't finish + sink.setNodeState(treeId, n.id, 'cancelled', opts={'reason': 'operator cancelled wave'}) + sink.clearExecution(treeId, n.id) +sink.emitWaveEvent({ + kind: 'complete', waveId, + summary: { + succeeded: count(leaves in S that completed with state='clean'), + failed: { + transient: count(failed leaves where lastError.failure_class == 'transient'), + rate_limited: count(failed leaves where lastError.failure_class == 'rate_limited'), + permanent: count(failed leaves where lastError.failure_class == 'permanent'), + }, + blocked: count(leaves in S left stale with lastError.failure_class == 'blocked'), + cancelled: count(S - completed_set) if cancelled else 0, + reflog_evicted: count(reflog evictions that fired during this wave), + } +}) +``` + +**What V1.0 cancel does and does not stop:** +- ✓ Stops the runner from starting new leaf dispatches. The next `ready.popNext()` returns the cancel signal; the loop exits. +- ✓ Marks all undispatched leaves as `cancelled` so the operator sees them clearly in the wave-complete toast. +- ✗ Does NOT abort in-flight `create_attack` or `add_message` HTTP calls that are mid-flight when cancel fires. Those complete (success → recorded; failure → marked failed). Per §7 rule 1, in-flight completes is the V1.0 contract. +- ✗ Does NOT recall already-committed backend ARs. Successful leaves stay in History. + +**Backend dependency (deferred to V1.x):** the `create_attack` route has no `CancellationToken` parameter today. Adding one is the [01 §12.8](01_tree_primitives.md#128-cancellation-deferred---accepted-follow-up-v1x) follow-up. Until then, the runner cannot stop a dispatched call from completing on the backend; it can only stop subsequent dispatches (above). For a 600-call refresh, the V1.0 UI-cancel saves the operator the *unstarted* calls (potentially hundreds), which is the dominant cost — the in-flight 4 are bounded. 
+ +**Operator surface ([02 §2.3](02_tree_ui_affordances.md#23-canvas-level-affordances)):** the wave-status banner during an in-flight wave shows `[ ●●●●●●○○○○ ] 6/60 (3 ✓, 0 ⚠, 1 ●) [Cancel]`. Clicking Cancel calls `runner.cancelWave(treeId)`. The button transitions to a disabled `[Cancelling…]` while in-flight leaves finish; the wave-complete toast then reads *"Wave cancelled: 6 ✓, 0 ⚠, 54 cancelled. [View wave]"*. The runner's `cancelWave` returns a `Promise` that resolves when the wave is fully settled (including draining the in-flight leaves), so the UI can await it before re-enabling the Refresh button. When the per-tree queue is non-empty, the banner adds a separate `[Cancel queued]` chip that calls `runner.cancelQueued(treeId)` — drops the queue without touching the active wave (see [§10.3](#103-backpressure-per-tree-wave-queue)). + +## 10. Concurrency Budget + +### 10.1 V1.0 — per-session, slot held across the full leaf sequence + +A single `Semaphore(4)` (or equivalent — Promise-counting pattern is fine) gates all dispatch in the session. With one tree per session in V1.0, this collapses to per-tree. **Each leaf's full dispatch sequence (§3.3) holds one slot for the duration** — the `create_attack` + N `add_message` calls all execute sequentially within the same slot. + +```ts +const dispatchSemaphore = new Semaphore(4) + +async function dispatch(leaf, waveId, waveTriggerKind) { + await dispatchSemaphore.acquire() + try { + // ... §3.3 body + } finally { + dispatchSemaphore.release() + } +} +``` + +### 10.2 V1.1 — per-Workspace with fair-share + +Per [01 §12.2](01_tree_primitives.md#122-concurrency-budget-maxparallel4-per-session-v10--per-workspace-v11-with-fair-share-decided): + +```ts +// Per-Workspace shared semaphore (single instance across all open trees) +const workspaceSemaphore = new Semaphore(4) + +// Per-tree "in-flight wave count" for fair-share picking. 
Updated by the dispatch +// wrapper below — incremented on slot acquire, decremented on slot release. +const inflightByTree = new Map() + +async function dispatchLeaf(treeId: ConversationTreeId, leaf: SendNode, waveId, waveTriggerKind) { + await workspaceSemaphore.acquire() + inflightByTree.set(treeId, (inflightByTree.get(treeId) ?? 0) + 1) + try { + // ... §3.3 body — full create_attack + N add_message sequence + } finally { + inflightByTree.set(treeId, (inflightByTree.get(treeId) ?? 1) - 1) + workspaceSemaphore.release() + } +} + +function pickNextReady(readyByTree: Map): { treeId, node } | null { + // Pick the tree with the fewest in-flight calls (fair-share) + const candidates = [...readyByTree.entries()].filter(([_, q]) => !q.isEmpty()) + if (candidates.length === 0) return null + candidates.sort(([a], [b]) => (inflightByTree.get(a) ?? 0) - (inflightByTree.get(b) ?? 0)) + const [treeId, queue] = candidates[0] + return { treeId, node: queue.pop() } +} +``` + +**Why per-Workspace and not per-target:** [01 §12.2](01_tree_primitives.md#122-concurrency-budget-maxparallel4-per-session-v10--per-workspace-v11-with-fair-share-decided) notes that `RoundRobinTarget` already handles cross-endpoint load distribution below the runner. Per-target budgeting is V1.x if real operators ask. + +### 10.3 Backpressure: per-tree wave queue + +V1.0 ships a per-tree wave queue on top of the per-session semaphore (§10.1). The semaphore is `Semaphore(4)` for in-flight leaf dispatches; the queue is keyed on `conversationTreeId` and serializes waves on the same tree. + +**The queue's lifecycle is implemented inside the [§2.1 entry-point shim step 4](#entry-point-shim-ordering-v10).** This section specifies the queue *contract* — FIFO order, no coalescing, stale-set recomputed at wave-start, banner copy. Implementers refer to §2.1 for the canonical `currentWaveByTree.set/delete` + `queueByTree.push/shift` + `queued`-event-emission code. 
The two module-level maps and the queue-element type are shared: + +```ts +const dispatchSemaphore = new Semaphore(4) // §10.1 in-flight cap +const queueByTree = new Map() // FIFO queue per tree +const currentWaveByTree = new Map() // sentinel for "a wave is active on this tree" + +interface WaveRequest { + waveId: string + triggerKind: WaveTriggerKind + rootNodeId: ConversationTreeNodeId // the subtree root (or tree.rootId for full-tree) + enqueuedAt: number + // The set of stale Sends is NOT stored here — it's recomputed when the wave actually + // starts (the operator may edit the tree between enqueue and dispatch); see the + // "stale-set is recomputed at wave-start" semantics below. +} +``` + +Rev-15 had a duplicate `refreshSubtree(treeId, rootNodeId, triggerKind)` pseudocode block here that referenced an undefined `_runWave` and never called `currentWaveByTree.set` — the queue was structurally unreachable (reviewer Finding 5). Rev 16 cuts the duplicate in favor of §2.1's shim spec, which wires the lifecycle correctly inside try/finally. + +**Queue semantics:** + +- **FIFO order** within a tree. Operator clicks Refresh-tree, then Refresh-subtree-X — both run; the second waits for the first to complete, then runs. +- **No automatic coalescing.** Two queued waves on the same tree run as two separate waves (two `waveId`s, two toasts, two AR-per-leaf groupings). The §3.3a debounce catches the 250ms double-click case; beyond that, operators get what they asked for. *Rationale:* coalescing wave A's stale-set into wave B is operator-invisible and would confuse "I clicked Refresh twice and got one toast." Explicit second-wave behavior maintains the mental model. +- **Stale-set is recomputed at wave-start, not at enqueue-time.** If the operator edits the tree between enqueue and dispatch, the wave dispatches against the current state. 
This is correct (operator's most recent intent wins) but means the wave-status banner's "estimated calls" preview should refresh when the wave moves from queued to active. +- **The wave-status banner shows queue state.** When `queueByTree.get(treeId)` is non-empty, the banner reads *"Wave in progress · 2 queued · [Cancel queued]"* — operators can clear pending waves without aborting the active one. The `[Cancel queued]` chip calls `runner.cancelQueued(treeId)` ([§9](#9-cancellation)) which drops every queued wave without touching the active one; each dropped wave emits its own `complete` event with `summary.cancelled` set to its leaf count. + +**V1.1 cross-tree behavior** per §10.2: the per-tree queues remain per-tree; the V1.1 fair-share scheduler picks from multiple trees' queues at the semaphore level. Per-tree serialization is preserved (never two waves on the same tree). + +### 10.4 Cross-tab advisory lock (V1.0) + +The §10.1/§10.2 semaphores are per-tab. Two browser tabs viewing the same `conversation_tree_id` (e.g., for the §13.1 minimal-Workspace side-by-side workflow per [01 §9.4.3](01_tree_primitives.md#943-concurrent-tab-advisory-lock-v10)) can independently fire `maxParallel=4` POSTs each — blowing the cap to 8 in-flight against one target. + +V1.0 ships a `BroadcastChannel('pyrit-runner')` **advisory lock keyed on `conversation_tree_id`**. Acquire-on-wave-start, release-on-wave-settle. Full spec including the operator-facing "Another tab is refreshing — Refresh anyway / Wait" modal is in [01 §9.4.3](01_tree_primitives.md#943-concurrent-tab-advisory-lock-v10). + +The runner's contract: + +- Every `refresh*` entry point's shim ([§2.1 entry-point shim ordering](#entry-point-shim-ordering-v10)) calls `lockManager.acquire(treeId)` as step 2, AFTER the tag-hygiene gate (step 1) and BEFORE the cost guardrail (step 3). 
+- If `acquire` returns `'busy'`, the runner surfaces a `WaveEvent { kind: 'busy', treeId, holderTabId }` and aborts the wave (no dispatches, no state changes; no `release` needed because no acquire succeeded). +- The UI listens for `busy` events and shows the modal. +- On wave settle — OR on any early-return from steps 3, 4, 5 (cost-modal cancel, wave queued behind another, dispatch-loop completion, dispatch-loop exception) — the shim's outer `try/finally` unconditionally calls `lockManager.release(treeId)`. The release is invariant against the early-return paths the rev-15 tag-hygiene gate (Finding 4) added to the runner; an implementer following the §2.1 shim spec cannot leak the lock. + +```ts +export interface CrossTabLockManager { + acquire(treeId: ConversationTreeId): Promise<'acquired' | 'busy'> + release(treeId: ConversationTreeId): void +} +``` + +**The lock manager is mocked in unit tests** (it's a clean boundary), and the §11.1 test list adds a `runner.crossTab.test.ts` for the lock-acquire / busy-modal / lock-release lifecycle. + +**TODO:spec** — the per-tree serialization contract is implicit above; make it an explicit invariant. Lean: at most one wave per tree in flight; concurrent refresh requests on the same tree queue or no-op (operator preference, **TBD**). + +## 11. Testing Surface + +### 11.1 Unit-testable in isolation (no backend) + +- **Topological walk correctness.** Given a hand-built tree and a stale-set, assert the dispatch order respects parent-before-child. +- **Concurrency cap.** With a stub `dispatch` that sleeps, assert `inflight.size ≤ maxParallel` throughout the wave. +- **Fair-share scheduling (V1.1).** With two trees and `maxParallel=4`, assert each tree gets ~2 in-flight slots over time. +- **State machine.** With a mock `RunnerStateSink`, assert the §5.1 three transitions fire in the right order. 
+- **Partial-commit on failure.** With a `dispatch` that fails leaf #7 of 60, assert leaves 8-60 still dispatch and the wave summary is correct. +- **In-flight cascade on shared-ancestor failure (§5.3).** With a chain-then-fan tree (10-deep stale prefix, 60 leaves) and a `dispatch` that fails the deepest interior Send X, assert: (a) every leaf in `ready` whose path includes X is dropped to `stale` with `lastError` referencing the failed wave, (b) the wave-summary counts them as `blocked` (not `failed`), (c) no leaf retries X via `add_message` in its own fresh_suffix, (d) the runner does NOT fire `add_message` for any blocked leaf, (e) a follow-up `retry_failed` wave includes the failed X plus its blocked descendants in S and admits them to `ready`. +- **Labels-divergence invariant (§4.3).** With a mock `attacksApi` that captures every `createAttack` and `addMessage` request, dispatch a leaf with N stale Sends and assert: (a) all N+1 captured requests' `labels` dicts are deep-equal, (b) every required label key (`operator`, `operation`, `conversation_tree_id`, `wave_id`, `wave_trigger_kind`, `tree_path`) is present in every request, (c) `parent_conversation_tree_id` is present in every request iff `tree.parentConversationTreeId !== null` (consistent omission per [§3.3a `_build_labels`](#33a-helpers-referenced-by-the-dispatch-step)). Guards against client-side regressions where a future runner refactor accidentally varies labels across the sequence. +- **Wave event sequence.** Assert `start → N × node_complete → complete` ordering. +- **`prepended_conversation` resolution.** Given a tree + leaf, assert the resolved message list matches expected. +- **200-message cap short-circuit.** Assert the leaf transitions to `failed` with the correct reason before any HTTP call fires. + +### 11.2 Needs the backend (integration tests) + +- **End-to-end `create_attack` round-trip** with realistic `prepended_conversation`. 
+- **Label writes propagate** to the AR's `labels` and survive a `GET /api/attacks/{id}`. +- **Labels round-trip (§4.3) — backend `_resolve_labels` regression canary.** Fire a real wave at a dev-backend leaf with 3 stale Sends; `GET /api/attacks/{ar.id}` and assert the round-tripped AR's `labels` dict matches the labels the runner sent on `create_attack` (the first call). The runner sends identical labels on every call in the sequence per the §4.3 invariant, so the round-tripped AR's labels should equal any single sent call's labels. Fails loudly if a future 0.16.x / 0.17.x backend change drifts `_resolve_labels` preference semantics under multi-piece `prepended_conversation` — the exact silent-corruption regression class the [§9.4.5](01_tree_primitives.md#945-hard-backend-dependency-relocate-_validate_operator_match) PR set anticipates. +- **Operator-lock interaction.** A wave with a leaf whose path contains a cross-operator message piece returns 400 from `add_message` (V1.1) — V1.0 with always-`create_attack` doesn't hit this path; document the V1.1 expansion test. +- **Concurrent waves across two browser tabs** confirming the [§10.4](#104-cross-tab-advisory-lock-v10) advisory lock holds: while the first tab's wave is active, the second tab's refresh surfaces a `busy` event and fires no dispatches (V1.0 contract: per-tab runners, coordinated only through the `BroadcastChannel` advisory lock). + +### 11.3 Test scaffolding + +Proposed structure under `frontend/src/runner/__tests__/`: +- `runner.dispatch.test.ts` — §11.1 unit tests +- `runner.failure.test.ts` — partial-commit + failure cascade +- `runner.concurrency.test.ts` — semaphore + fair-share +- `runner.crossTab.test.ts` — `BroadcastChannel` lock acquire / busy / release (§10.4) +- `runner.reflog.test.ts` — eviction events, cap configurability, `pinExecution` +- `runner.materialization.test.ts` — `prepended_conversation` resolution +- `runner.integration.test.ts` — §11.2 with msw-mocked backend or real dev-server + +## 12. Open Questions + +- **Q.1 — Debounce on `refreshTree`.** §3.3 lean is "yes, in the UI button handler." Confirm with operators after first usability test. 
+- **Q.2 — Per-tree serialization vs. parallel waves on one tree.** §10.3 — lean is serialize per tree, but for the "edit root, click Refresh, immediately edit again, click Refresh again" pattern an operator might expect both to run. **TBD with operators.** +- **Q.3 — `prepended_conversation` >200 messages recovery.** §4.2 — the "Clone tree from a midpoint" suggestion needs an actual primitive. Resolved: V1.0 `branchToNewTree` (per [01 §6.5](01_tree_primitives.md#65-branch-from-node---the-immutable-history-primitive)) provides this — clone from any midpoint node and continue from there. V1.0 also surfaces the soft warning at 180 turns and the hard refusal at 200 per §4.2. +- **Q.4 — Streaming partial responses for very long Sends.** Out of scope per §1 Non-Goals; revisit in V2 if operator complaints about "the UI looks frozen during a 30-second target call" outnumber other priorities. +- **Q.5 — Telemetry events.** Should the runner emit OpenTelemetry spans for each dispatch, each wave, and each failure? Lean: yes, behind a feature flag, to validate the §11.1 invariants in production. **TODO:spec** — coordinate with the existing telemetry surface (search `frontend/src/services/` for the current pattern). Per [Q.S.4](#qs1-qs9-fourth-pass-rubber-duck-gate-items-rev-18) the per-leaf and per-event timing fields ship V1.0; OpenTelemetry wraps them V1.x. +- **Q.6 — Intra-wave memoization for shared stale interior Sends.** Designed in revision 14, cut in revision 15 per reviewer Finding 2; re-litigated in rev 18 per [Q.S.1](#qs1-qs9-fourth-pass-rubber-duck-gate-items-rev-18) and **DECIDED V1.0: accept-and-disclose (cache stays cut, Crescendo cost-cliff documented in [01 §1.2](01_tree_primitives.md#12-v10-known-limitations-sharp-edges-in-what-v10-does-ship))**. 
The mechanism (per-wave `sharedPieceCache` keyed on `node_id`, populated by the first leaf's regeneration of a shared interior Send, consulted by subsequent leaves' resolvers to fold cached pieces into `prepended_conversation` instead of re-firing the target) would collapse the 60-leaf-with-10-deep-shared-stale-prefix case from 600 to 70 calls. **Cut because** V1.0's two fan axes (`attempt`, `converter`) don't produce shared interior Sends in the trivial case — attempt-fan children diverge at the leaf-Send and converter-fan children diverge at the converter UserTurn. The chain-then-fan + Crescendo-with-depth-extension workflow IS affected; rev 18 accepted the cost cliff for V1.0 in exchange for the dumb-but-correct runner property (no per-wave cache invalidation bugs in unhappy paths). **Revisit in V1.x** with telemetry from the [Q.S.4 Crescendo experiment](#qs1-qs9-fourth-pass-rubber-duck-gate-items-rev-18) — if operators reach all-clean within 2 [Retry failed] cycles the cache stays cut; if not, the rev-14 design is restored. The `prompt`/`system_prompt`/`target` axes (V1.1+) can produce shared interior Sends and may justify the cache independent of the Crescendo workflow. +- **Q.7 — V1.x rate-limit handling: `Retry-After` header parsing + countdown timer + auto-enable.** V1.0 ships L1 diagnostic-only handling per [§3.3a `_format_api_error`](#33a-helpers-referenced-by-the-dispatch-step) and reviewer Finding 6a: leaves that hit 429 (or provider-specific rate-limit shapes) get `failure_class='rate_limited'`, surface distinctly in the wave-complete toast (`⏱ rate-limited` count), and disable [Retry failed] when all failed leaves are rate-limited. **V1.x adds:** parse the `Retry-After` response header (or provider-specific equivalents like Anthropic's `x-ratelimit-reset` epoch); render a countdown timer on the [Retry failed] button; auto-enable when the countdown expires. 
The leaf-failure-class field shipping in V1.0 makes V1.x a non-breaking addition — the migration is a UI/timer + per-leaf `retry_after_ms: number | null` field, no structural changes to `S`, the dispatch loop, or the cascade contract. **V1.x++ (deferred further):** per-target token-bucket throttling in the dispatch loop (L3 of the design spectrum) that prevents the initial 60-failure wave by holding ready leaves until tokens replenish. Requires target-capability lookup, per-target queue, config UI; the right time is once `TargetCapabilitiesInfo.max_requests_per_minute` exposure is plumbed through the runner. +- **Q.G.1 — Provider-specific rate-limit detection registry.** `_format_api_error`'s rate-limit detection needs a small mapping table of (status_code, error_code, response-body-snippet) tuples per provider: HTTP 429 covers most, but Anthropic's `overloaded_error` (sometimes HTTP 529), OpenAI's `rate_limit_exceeded` error code, Azure's specific shape, and Google's quota-exceeded responses each need their own match. **Lean for V1.0:** small registry at `frontend/src/runner/rateLimitDetection.ts` consumed by `_is_provider_rate_limit_shape(error)`. Per-provider entries are easy to add and don't require backend changes. **Promote to backend (V1.x+)** if the V1.x token-bucket throttling story lands — the backend already knows which provider each target maps to, so server-side detection avoids client-side maintenance of the registry. +- **Q.H.1 — Label inheritance for prepended pieces hydrated from pre-V1.0 ARs.** Under [01 §13.1 `openTreeFromAttackResult`](01_tree_primitives.md#131-v10-minimal-workspace) (Nit H), the first Refresh on a minted tree fires `create_attack` with `prepended_conversation` populated from the source AR's pieces (which have no `conversation_tree_id` label). Backend [`_resolve_labels` at attack_service.py:L716](../../../pyrit/backend/services/attack_service.py#L716) prefers existing piece labels over request labels. 
Two choices for the prepended pieces' label state: **(a)** inherit the new tree's `conversation_tree_id` via a backend-side rewrite or a label-fill-on-write; **(b)** stay un-labelled, preserving backend append-only semantics. **Lean: (b)** — History filter by `conversation_tree_id` returns only the new tree's leaves; operators who want to trace the legacy provenance use History filter by `conversation_id`. Needs a sentence of agreement in the [§9.4.5](01_tree_primitives.md#945-hard-backend-dependency-relocate-_validate_operator_match) PR description so reviewers see the choice. Does NOT affect the runner's labels-divergence invariant ([§4.3](#43-label-writes-the-round-trip-fidelity-contract)) — that invariant is about labels the runner writes on its own create_attack/add_message calls within one leaf's dispatch, which all carry identical labels per call by construction. +- **Q.R.1 — Drained-wave cost-modal suppression (V1.x).** The [§2.1 entry-point shim](#entry-point-shim-ordering-v10)'s queue-drain loop re-enters via `await refreshSubtree(...)` for each queued wave — every drained wave re-runs the full shim including step 3 (cost modal). Operator-hostile when 5+ waves are queued: the operator approved the top-level wave, but the cost modal fires again for each drained one. **Lean for V1.x:** suppress the cost modal on drained waves (the operator's queue-time confirmation propagates to drained successors); the suppression should respect the count-threshold for SAFETY (if the drained wave is unexpectedly large — say, due to operator edits between enqueue and dispatch widening the stale-set — still fire the modal). Mechanism: pass a `fromDrain: boolean` flag through the shim and bypass the cost guardrail when `fromDrain && estimatedCalls <= 2 * approvedCountFromOriginatingWave`. Out of V1.0 because V1.0 ships single-tree single-wave-at-a-time as the common case (§1.2); queue depth >1 is rare without the V1.1 tab strip. 
+ +### Q.S.1–Q.S.9: Fourth-pass + rubber-duck gate items (rev 18) + +Formalized from the rev-18 rubber-duck review. **Q.S.1 and Q.S.2 are DECIDED V1.0** (rev 18; see entries below). **Q.S.3 remains a V1.0 BLOCKER candidate** gated on the [Q.S.4 Crescendo experiment](#qs1-qs9-fourth-pass-rubber-duck-gate-items-rev-18) outcome. Q.S.5–Q.S.9 are PR-sized follow-ups that do not gate implementer onboarding. + +- **Q.S.1 — Intra-wave memoization: DECIDED V1.0 — accept-and-disclose (rev 18).** The rev-15 Q.6 cut argued "V1.0's two fan axes don't produce shared interior Sends." Rubber-duck Finding B.1 demonstrated this is true only for trivial cases: chain-then-fan trees with edits high up the chain — Crescendo with depth-extension ([crescendo.py:L74](../../../pyrit/executor/attack/multi_turn/crescendo.py#L74)) — produce the 60-leaf/10-deep-shared-stale-prefix case (600 add_message calls instead of ~70). **Decision:** V1.0 does NOT ship the rev-14 `sharedPieceCache`; the cost cliff is documented in [01 §1.2 known limitations](01_tree_primitives.md#12-v10-known-limitations-sharp-edges-in-what-v10-does-ship) so operators discover it via documentation, not the cost modal mid-refresh. The [02 §8.1](02_tree_ui_affordances.md#81-the-v1-chain-preview-banner--confirm-modal--toast--drawer-panel) cost-guardrail modal intercepts at 20 calls and the [02 §2.2](02_tree_ui_affordances.md#22-per-node-action-rail) `↻` tooltip cost-preview surfaces the cost on hover, so operators are forewarned at click time. V1.x revisits via [Q.6](#12-open-questions) with telemetry from the [Q.S.4](#qs1-qs9-fourth-pass-rubber-duck-gate-items-rev-18) Crescendo experiment — if the experiment shows operators reach all-clean within 2 [Retry failed] cycles, the cache stays cut; if not, the rev-14 design is restored. 
Rationale for accept-and-disclose: V1.0's runner-correctness story is small and well-tested; layering a per-wave cache adds invalidation bugs in unhappy paths (mid-wave cancel, leaf-edit-during-wave) that the V1.0 design has otherwise eliminated by construction. Accept-the-cost preserves the dumb-but-correct property until telemetry justifies the complexity. + +- **Q.S.2 — Operator-as-tag vs operator-as-claim: DECIDED V1.0 — operator-as-tag (honor-system), rev 18.** Per rubber-duck Finding B.2: [§9.1](01_tree_primitives.md#91-operator-isolation-posture) had framed `operator` as "a tag the operator picks for History grouping + per-operator AR isolation, **not an auth claim**" while [§9.4.5](01_tree_primitives.md#945-hard-backend-dependency-relocate-_validate_operator_match) demanded the backend TIGHTEN `_validate_operator_match` to "reject anonymous requests against operator-owned ARs." These implied different mental models. **Decision:** operator-as-tag wins. §9.4.5 is scaled back to relocation-only (no anonymous-rejection); the no-labels early-return is preserved by design — anonymous callers pass through unchallenged because the tag is honor-system, not an auth claim. The "Branch from here is the escape hatch" framing in §9.1 stays consistent: any operator can branch any tree they can read, creating a fresh AR under their own tag with no auth gate. **The V1.0 posture defends against accidental mis-attribution and casual cross-operator extensions, not against motivated bypass.** V2 multi-operator collaboration ([01 §13.8](01_tree_primitives.md#138-multi-operator-collaboration-v2)) revisits whether the tag should be promoted to a claim — if yes, the escape-hatch primitive needs a confirmation step at that time. V1.0 ships honor-system.
+ +- **Q.S.3 — Per-target rate-limit circuit breaker (V1.0 BLOCKER candidate).** Per rubber-duck Finding B.5: AR-per-leaf's "each leaf is independent" claim is true at the data layer but **false at the rate-limit layer** — a 60-leaf attempt-fan against a 60-RPM target dispatches 60 leaves and collects 60 separate 429s, after which the [Retry failed] button is disabled because all failures are rate-limited (the operator's only recourse is *"wait, click Refresh tree, watch the same thing happen, repeat"*). The Q.7 deferral of `Retry-After` parsing to V1.x compounds this. **The decision is:** add a per-target circuit breaker to the dispatch loop — when N consecutive 429s land within W seconds against one `target_registry_name`, halt further dispatches to that target for the rate-limit window (or a backoff). Add to [§10](#10-concurrency--maxparallel) as §10.5. This stays out of V1.0 only if the [Q.S.4](#qs1-qs9-fourth-pass-rubber-duck-gate-items-rev-18) Crescendo experiment shows operators reach all-clean within 2 [Retry failed] cycles; if they don't, it ships in V1.0. + +- **Q.S.4 — Crescendo de-risk experiment (test plan; gates Q.S.1 + Q.S.3).** Per rubber-duck Finding E. Build a 60-leaf Crescendo-shaped tree in a throwaway test rig pointing at a real `gpt-4o` endpoint with a 60-RPM rate limit (or a `RoundRobinTarget` configured to simulate one). Click Refresh tree. Measure: (a) how many 429s land, (b) what the wave-complete toast says (including the new `✋ needs-fix` bucket from rev 18), (c) what the operator's `[Retry failed]` experience looks like across 2+ cycles, (d) total wall-clock to all-clean. Three possible outcomes: (1) operator clicks Retry twice and it works — V1.0 is fine without Q.S.1 + Q.S.3; (2) operator clicks Retry 8 times across 10 minutes and it eventually works — V1.0 needs Q.S.3 before ship, Q.S.1 deferred; (3) operator never reaches all-clean — V1.0 needs both Q.S.1 + Q.S.3 before ship. One day of work; cleanly de-risks the largest cost-cliff in the spec.
Should run before the runner PR opens, not after. + +- **Q.S.5 — Transform-reconciliation unification: one React effect instead of two runner walkers (V1.1 candidate).** Per rubber-duck Finding B.4: [§3.1 step 6 `reconcileAllTransforms`](#31-topological-walk) (wave-end, tree-wide) and the per-dispatch `reconcileTransformStates` (path-scoped) are two places that must stay in sync — adding a new transform-state rule in V1.1 requires updating both. Reviewer's structural alternative: own transform-state reconciliation in *one* place — a React effect that subscribes to "Send state went to `clean`" events and re-runs the per-node rule on the tree. Removes both runner-side invocations; the runner stops owning anything but its three Send transitions ([§5.1](#51-the-runner-only-owns-three-transitions)). **Defer to V1.1** because (a) the V1.0 two-place approach is correct and rev-15 reviewer-blessed; (b) the React-effect migration moves the runner's state-ownership boundary, which is bigger than a docs-only patch; (c) ScoreNode V1.0 render-only scope already minimizes the cost of the duplication. Revisit when V1.1's `runScorer(node_id)` makes ScoreNode a dispatch-class node and the reconciliation surface grows. + +- **Q.S.6 — Accessibility follow-up doc (V1.0 deliverable; half-day scope).** Per rubber-duck Finding C.2: the docs are silent on focus management when layout shifts move a focused node off-screen; screen-reader announcement strategy for a 60-leaf fan completion; keyboard discoverability of the `+` edge affordance which is hover-only ([02 §2.1](02_tree_ui_affordances.md#21-per-edge-insert-on-edge-)); tab order through the per-node action rail. **The deliverable** is a 04_accessibility.md doc enumerating the keyboard-nav state machine, the focus-restore-on-layout-shift policy, and the screen-reader announcement throttling rules. Tractable in a half-day; not architecturally interesting but blocking for WCAG 2.1 AA-mandated security-team deployments. 
+ +- **Q.S.7 — `pieceCache` cross-tab read-after-write semantics (V1.x; documentation).** Per rubber-duck Finding C.4: [§3.3a piece-fetch caching](#33a-helpers-referenced-by-the-dispatch-step) spells out the pre-fetch mechanism but doesn't address: if tab A holds the lock, mutates pieces (via `add_message`), releases; tab B acquires, pre-fetches the same pieces — is the GET guaranteed to see tab A's writes? For SQLite with default isolation (the PyRIT default per `pyrit/backend/services/attack_service.py` session config) this is fine (committed = visible). For hypothetical PostgreSQL with REPEATABLE READ it's less obvious. **The fix** is a paragraph in §3.3a naming the assumed database isolation level ("read-committed or stronger") and the V1.0 single-user deployment context that makes the assumption safe. V2 multi-operator path ([01 §13.8](01_tree_primitives.md#138-multi-operator-collaboration-v2)) needs to revisit. Documentation patch; ~50 words. + +- **Q.S.8 — Collapse `RootPromptNode` + `ImportMessageNode` into one `SourceNode { source: 'root' | 'import' }` (V1.x refactor).** Per rubber-duck Finding D.1: the two kinds differ only in source payload; both occupy the same side-effect class in the runner ([§4.1 "Source" branch](#41-the-resolved-root-to-leaf-path--prepended-final-user-turn)); the runner treats them identically through every spine. The current 6-kind taxonomy is 4 kinds masquerading as 6; collapsing Root/Import saves one `kind` branch in `conversationTreeToReactFlow`, one file under `frontend/src/components/Tree/nodes/`, and one branch in every consumer that switches on `kind`. **Defer to V1.x** because the V1.0 two-kind split is documented and the V1.0 implementer cost of carrying both kinds is one extra file (small). 
Revisit when the editor surface for each kind diverges enough to make the union shape awkward, or when V1.1 adds a third source variant (e.g., "import from local JSON" per [01 §1.2 export/import gap](01_tree_primitives.md#12-v10-known-limitations-sharp-edges-in-what-v10-does-ship)) and that addition becomes the natural moment for the rename. + +- **Q.S.9 — Pure-event-log alternative reconsideration for V1.x scoping (decision point at V2 boundary).** Per rubber-duck Finding D.4: the ConversationTree-vs-AttackResult split's rejection of "pure event log + projection" was too curt for a decision V2 server-side collaboration will reopen. The V1.0 design already implements *most* of an event log, expensively reimplemented as four separate mechanisms: §6.9 undo with state-snapshot widening, §9.4.3 BroadcastChannel advisory lock, §9.4.1 labels-decoding reload reconstruction, §10.3 per-tree wave queue. A pure event log would unify them. **The decision is:** revisit this explicitly at the V2 server-side trees scoping milestone (not before — V1.0 / V1.1 are committed to the ConversationTree shape). The V2 PR should weigh (a) event-sourcing rewrite vs (b) extending the V1.x ConversationTree with a `version` field ([already added in V1.0 per rev-18 §3.1](01_tree_primitives.md#3-data-model)) + a server-side last-write-wins resolver. Decision point, not gate item. 
+ +## Appendix: Runner Module Structure (Proposed) + +``` +frontend/src/runner/ +├── runner.ts # public Runner interface + dispatch loop (§3) +├── materialization.ts # resolve_prepended_conversation (§4.1) +├── stateSink.ts # RunnerStateSink interface + React-bound impl +├── waveBookkeeping.ts # waveId + waveTriggerKind enum (§6) +├── concurrency.ts # Semaphore + fair-share pick (§10) +├── costGuardrail.ts # threshold check + modal trigger (§2.3) +└── __tests__/ + ├── runner.dispatch.test.ts + ├── runner.failure.test.ts + ├── runner.concurrency.test.ts + ├── runner.materialization.test.ts + └── runner.integration.test.ts +``` + +The split keeps the dispatch loop (§3) under ~150 LOC by delegating; everything else is testable in isolation per §11.1. diff --git a/doc/gui/design/04_tree_ui_v1_shipability_plan.md b/doc/gui/design/04_tree_ui_v1_shipability_plan.md new file mode 100644 index 0000000000..f631e77e33 --- /dev/null +++ b/doc/gui/design/04_tree_ui_v1_shipability_plan.md @@ -0,0 +1,523 @@ +# Tree UI V1.0 Shipability Action Plan + +Status: implementation-guide checkpoint after live browser review (revision 2) — confirmed deliverables: Chat-to-tree opens the full active attack as a merged tree; adding a prompt auto-creates its pending response without exposing Send vocabulary; V1.0 hides future-only dead-end controls; fans can be pruned to a picked path without deleting backend history; Tree View has a resizable tree-left/path-chat-right split; converters are visible transform nodes; Playwright is the MVP acceptance harness. +Scope: finish the V1.0 operator experience without expanding into V1.1 architecture. This session's output is a descriptive implementation guide: each confirmed decision lands with current-state evidence, implementation notes, acceptance criteria, and alternatives considered. 
+ +## Exit Criteria + +A reviewer can use only the browser, with `VITE_ENABLE_TREE_UI=true`, and complete the full Tree UI happy path plus recovery flows without reading code: + +1. Open an existing attack as a tree from Chat or History; attacks with multiple conversations reconstruct as one merged tree, not as a single selected conversation. +2. Inspect the selected path as a chat-style transcript beside the tree canvas, with a resizable split that keeps tree context visible. +3. Add a follow-up prompt after a response and immediately see the linked pending response that Refresh will produce. +4. Create attempt fans and visible converter-transform branches with understandable controls. +5. Edit prompts/converters, see stale propagation, and refresh successfully or get a clear preflight reason why refresh is blocked. +6. Recover from mistakes via delete confirmation, fan prune-to-picked-path, branch clone, dirty-edit guard, and visible past-run state. +7. Use only visible controls that either work in V1.0 or explain a current preflight blocker; no normal MVP path exposes future-only disabled buttons. +8. Reload the page and get an honest restored tree or an explicit degraded-state explanation. +9. Navigate the canvas with keyboard, mouse, minimap, controls, and sane auto-layout behavior. +10. Pass the mocked Playwright MVP acceptance suite, with screenshots and layout assertions for the critical Tree UI states. + +## P0: Must Fix Before V1.0 Sign-Off + +### 1. Chat-To-Tree Full-Attack Reconstruction + +Problem: +Tree View is currently reachable from History, but not from the loaded Chat attack. More importantly, current open-as-tree reconstruction is single-conversation shaped: live browser validation of a History row with 30 messages and 3 conversations produced a 12-node linear tree with 0 fan nodes and 0 stack summaries. That does not meet the operator goal of using Tree View to understand the whole attack graph from the place they are already working. 
+ +Deliverables: +- Add an `Open tree` action to the Chat ribbon when `attackResultId` is present. Place it next to the conversations-panel toggle so it reads as another view of the active attack, not as a per-message branch action. +- Route the Chat action through the same dirty-edit guarded tree swap used by History open-as-tree. +- Replace or extend the current single-AR `useAutoReverse(openFromAttackResultId)` path with an attack-level reconstruction mode: + - fetch the `AttackSummary`, + - fetch `getConversations(attackResultId)`, + - fetch messages for every active conversation returned by that list, + - build one `ConversationTree` by merging identical prefixes and branching at the first divergent turn, + - preserve target registry name and operator/operation labels on the reconstructed root/tree metadata. +- Keep History `Open as tree`, but make it use the same full-attack reconstruction path. A History row's `Convs > 1` must not silently reconstruct only `AttackSummary.conversation_id`. +- If prefix merging cannot confidently infer a shared branch point, reconstruct the main conversation and show an explicit degraded-state banner naming how many related conversations were omitted. + +Implementation notes: +- Chat entry wiring lives in `frontend/src/App.tsx` and `frontend/src/components/Chat/ChatWindow.tsx`. Add a prop such as `onOpenAttackAsTree?: (attackResultId: string) => void` to `ChatWindow` and have App reuse the existing `handleOpenAttackAsTree` dirty-swap path. +- Reconstruction lifecycle currently lives in `frontend/src/components/Tree/useAutoReverse.ts`, which fetches only `getAttack` and `getMessages(ar.conversation_id)`. Introduce a second hook or mode whose dependency surface also includes `getConversations`. +- Pure merge logic belongs beside `linearChainFromMessages` in `frontend/src/runner/autoReverse.ts`, not inside React components. Keep it unit-testable with arrays of conversation message lists. 
+- The merge key should be conservative: role + converted text/value + converter identifiers + attachment identity where available. When keys differ, branch. When required identity is missing, degrade explicitly rather than guessing. +- Preserve the URL fragment behavior: V1.0+ trees with `labels.conversation_tree_id` keep that id; pre-tree attacks get a fresh client tree id. + +Acceptance: +- From Chat, a loaded attack with one conversation opens Tree View to the same content currently available from History open-as-tree. +- From Chat, a loaded attack with multiple conversations opens one tree containing all active conversations under the attack. +- From History, a row with `Convs > 1` uses the same full-attack reconstruction and does not silently drop related conversations. +- Shared prefixes appear once; divergent turns become branches/fans with stable slot ordering. +- The empty Tree View state points operators to both Chat-loaded attacks and History, not only History. +- Browser tests cover Chat open-as-tree for one-conversation and multi-conversation attacks; History open-as-tree for a multi-conversation attack; and degraded-state copy when merge inference is not possible. + +Alternatives considered: +Keeping History-only open-as-tree was rejected because it forces operators out of the Chat context where they notice the need for a tree. Adding a Chat button that reuses the current single-conversation auto-reverse was rejected because it looks convenient while silently dropping related conversations. Replacing Chat's conversation panel with Tree View was rejected as too disruptive for V1.0 and contrary to the non-goal of replacing the linear chat experience. + +### 2. Auto-Create Response Placeholder / No Send Vocabulary + +Problem: +The live Tree UI exposes the internal runner shape too directly. A follow-up action on an assistant response currently creates only `User turn (edited) -> New prompt`, leaving a dangling prompt with no visible response target. 
At the same time, fully coalescing prompt and response into one rendered card would make the important `prompt -> converter -> response` workflow harder to represent honestly. Operators should see a prompt-response path, but they should not need to understand a separate `SendNode` concept. + +Deliverables: +- Keep the internal `UserTurnNode -> SendNode` data/runner model for V1.0. +- Change `Add follow-up prompt` so it creates both nodes in one structural edit: + - `Assistant response -> User prompt (edited) -> Assistant response placeholder (stale/dirty)`. +- Render the child `SendNode` as a response-state card, never as `Send`: + - clean with response text: `Assistant response`, + - stale/edited/draft without response text: `Pending response` or `Response pending refresh`, + - failed with prior response text: show the previous response separately from the latest error. +- Keep the prompt-to-response edge as the insertion point for converter transforms. `Append converter` inserts a visible converter transform node between the prompt and response placeholder; converter-node behavior is specified in §6. +- Preserve the direct prompt-to-response path as the no-converter baseline unless the operator explicitly replaces it. +- Remove or avoid user-facing `Send` vocabulary in buttons, menus, labels, tests, and empty states. Implementation names can remain `SendNode` in TypeScript. + +Implementation notes: +- Structural behavior belongs in `frontend/src/runner/treeStateReducer.ts`. Add a helper such as `applyAppendPromptWithResponse(parentResponseId, uuid)` or extend `applyAppendChild` for the follow-up path so both nodes are created atomically. +- `frontend/src/components/Tree/SendCard.tsx` should choose user-facing labels from response state. The current `kindLabel = node.state === 'draft' || node.state === 'edited' ? 'Send' : 'Assistant response'` should become response vocabulary for every state.
+- `frontend/src/components/Tree/InsertEdge.tsx` should keep the prompt-to-response edge menu focused on legal V1.0 actions: append converter transform, compare converters, and response refresh path. Avoid future disabled entries in the normal MVP menu. +- Converter transform nodes must stay non-side-effecting with respect to the target: they prepare converter IDs / preview output; downstream response nodes remain the target-call refresh points. +- Dirty-edit detection should count the newly created prompt/response pair as one operator edit for modal copy, even if two internal nodes were added. + +Acceptance: +- Clicking `Add follow-up prompt` on an assistant response creates an edited prompt and a visible pending response placeholder in one action. +- The pending response placeholder is the obvious refresh target and participates in cost preview/stale propagation. +- No visible card, tooltip, menu item, or empty-state copy says `Send`. +- The prompt-to-response edge offers `Append converter`; choosing a converter inserts a visible transform node and keeps the downstream response placeholder linked. +- Browser tests cover add-follow-up creating the paired pending response, converter insertion on the prompt-to-response edge, and absence of visible `Send` vocabulary in the normal Tree UI. + +Alternatives considered: +Keeping the current explicit response/send behavior was rejected because it leaves a dangling prompt after the dominant follow-up action. Coalescing prompt and response into one rendered card was rejected after considering `prompt -> converter -> response`: the converter belongs between prompt and response, and hiding that relationship inside a combined card makes the edge affordance dishonest. Hiding converter state as only prompt-card chips was rejected after converter research because converters carry parameters, supported input/output data types, previews, possible LLM cost, and direct-vs-converted branch semantics that operators need to see. 
+ +### 3. Hide Future-Only Dead-End Controls + +Problem: +Live browser review found disabled future affordances in normal operator paths: `Branch as subtree (coming in a future release)` appears on action rails, and `Fan out: prompt (coming later)` appears in the user-turn edge menu. These controls are intentionally unavailable, but their presence makes the MVP feel unfinished and forces operators to learn which visible controls to ignore. + +Deliverables: +- Hide future-only controls from the normal V1.0 operator UI. +- Remove `Branch as subtree` from the action rail unless a dev/review flag explicitly enables future-slot preview. +- Remove disabled future fan axes from edge insert menus in normal V1.0. The menu should show only actions that can be completed now. +- Keep disabled states only for current, actionable preflight blockers, such as no target selected, operator tag missing, no converter selected, wave already running, or insufficient permissions. +- Replace any visible `coming later`, `future release`, or equivalent tooltip copy with either no control or a current-state explanation. +- If future-slot preview is useful for implementers, gate it behind a clearly named development flag and exclude it from browser tests that represent the operator MVP. + +Implementation notes: +- `frontend/src/components/Tree/actionRail.tsx` currently renders a disabled `Branch as subtree` button unconditionally. Gate or remove it for normal V1.0. +- `frontend/src/components/Tree/InsertEdge.tsx` currently includes disabled V1.1 fan-axis rows. Filter those options out unless future-slot preview is enabled. +- Keep layout stable through CSS spacing and responsive constraints rather than reserving dead controls. +- Search UI code and tests for strings like `coming later`, `future release`, `Available in a future release`, and `V1.1` before sign-off; normal operator snapshots should not contain them. 
+ +Acceptance: +- In normal V1.0 mode, action rails expose only implemented actions or actions blocked by a current preflight condition. +- Edge insert menus expose only implemented V1.0 actions. +- No visible tooltip/menu/button copy says `coming later`, `future release`, or equivalent on the normal Tree UI path. +- Browser tests cover the response action rail and user-turn edge menu and assert future-only controls are absent. +- Any optional future-slot preview is gated behind a dev/review flag and is not enabled for MVP validation. + +Alternatives considered: +Keeping disabled future slots visible was rejected because slot reservation is less important than operator confidence in a first MVP. Improving tooltip copy was rejected because the core problem is the visible dead end, not the exact explanation. Showing future controls only in a development/review mode remains acceptable because it keeps implementation inspection possible without leaking unfinished affordances to operators. + +### 4. Prune Fan To Picked Path + +Problem: +The current UI has two different fan-removal-adjacent behaviors, neither of which matches the operator goal. `Collapse to stack` is visual only; it hides repeated fan children but keeps the fan. `Delete` removes the selected node and all descendants, which is too destructive when the operator has compared variants and wants to keep the useful path. Operators need a way to finish a fan comparison by removing the fan wrapper and extra variants from the client tree while preserving one selected path/subtree. + +Deliverables: +- Add a fan-level action named `Prune to picked path`. +- If `FanNode.params.promotedChildSlotIndex` is set, pruning keeps that slot. +- If no child is picked, clicking the action opens a small chooser listing variants by slot, state, and response/prompt preview. 
+- Show a confirmation modal before pruning: + - identify the kept slot, + - state how many other variants/subtrees will be removed from this tree, + - state that backend `AttackResult` history is not deleted. +- Rewire the client tree so `parent -> fan -> pickedChild` becomes `parent -> pickedChild`. +- Preserve the picked child node, execution, execution history, descendants, and edge slot semantics where relevant. +- Remove the fan node and non-picked variant subtrees from client tree state. Do not delete backend attacks, messages, or history entries. + +Implementation notes: +- Pure structural logic belongs in `frontend/src/runner/treeStateReducer.ts`, e.g. `applyPruneFanToPickedPath(tree, fanNodeId, slotIndex)`. +- The reducer should locate the incoming edge to the fan, the outgoing edge for the kept slot, and every descendant of non-kept children. It should remove the fan plus non-kept descendants, update the kept child `parentId` to the fan's former parent, and replace the incoming/outgoing edges with one parent-to-kept-child edge using the fan's former incoming `slotIndex`. +- If the kept child is itself a response placeholder or has descendants, preserve that entire subtree unchanged except for the kept child's new `parentId`. +- Add a host callback in `TreeRunnerHost` and fan-specific action in `FanCard`. The action should sit near Pick/Collapse controls, not in the generic Delete path. +- Undo should treat prune as one structural operation if the existing undo stack is wired for structural edits; if not, document undo as a follow-up rather than silently half-supporting it. + +Acceptance: +- Operator can create a 3-attempt fan, pick one attempt, prune to that picked path, and see the fan card disappear while the picked response/subtree remains. +- If no attempt is picked, the prune action asks the operator which slot to keep before showing the confirmation. 
+- Pruning removes only client tree nodes for non-picked variants; backend History still contains prior `AttackResult`s. +- Cost preview and refresh behavior still work on the preserved path after pruning. +- Browser tests cover pick-then-prune, chooser-then-prune, cancel confirmation, and preservation of the picked subtree. + +Alternatives considered: +Keeping only visual stack collapse was rejected because it does not clean up the tree after comparison. Deleting the whole fan subtree was rejected because it destroys useful work and makes fan experimentation feel unsafe. Promoting all fan children by removing only the wrapper was rejected for V1.0 because it can create multiple siblings in a place where the parent expected one path, making the canvas harder to reason about. Per-variant delete remains useful as a later complement, but it does not replace the dominant "keep the winner" workflow. + +### 5. Resizable Tree / Path Chat Split View + +Problem: +The current `Open in linear view` action opens an in-tree details drawer with a `Path` section. That is useful metadata inspection, but it creates a weaker second linear surface rather than giving operators the normal text-message reading experience while they reason about branches. Jumping back to the existing Chat tab would preserve a canonical chat surface, but it drops tree context. For MVP, Tree View should combine both: tree structure and a readable chat transcript for the selected path. + +Deliverables: +- Keep the existing Chat tab unchanged for V1.0. +- Change Tree View into a two-pane workspace: + - left pane: tree canvas, + - right pane: chat-style transcript for the selected root-to-node path. +- Make the tree/chat split resizable with a visible drag handle and keyboard-accessible resize controls. +- Persist the split width for the current browser session or workspace settings, with sane min/max widths so neither pane becomes unusable. 
+- Selecting a node in the tree updates the path chat to that root-to-node path. +- Selecting a message/bubble in the path chat highlights and scrolls to the corresponding tree node. +- Pending response placeholders render as pending assistant bubbles in the path chat. +- Converter transform steps render as compact transform bubbles in the path chat and correspond to visible converter nodes in the tree. +- Keep structure-only actions on the tree canvas. Path-chat actions can include transcript-native conveniences, but they must call the same tree edit callbacks as the canvas. + +Implementation notes: +- `TreeRunnerHost` should own the selected node/path state and pass it to both `TreeCanvas` and the new path-chat pane. +- Extract reusable message-bubble presentation from `frontend/src/components/Chat/MessageList.tsx` if practical; avoid importing Chat's attack/conversation orchestration into Tree View. +- Add a pure path projection helper near tree utilities: given `ConversationTree` + selected node id, return ordered transcript entries with node ids, roles, text, converter transform steps, execution/error state, and pending-response status. +- The right pane should not pretend to be the backend Chat tab. It is a selected-path transcript over the client tree, including unpersisted edits and pending response placeholders. +- Replace or demote `Open in linear view`; if a details drawer remains, label it as `Details` or `Inspect node`, not as a separate linear view. +- Use CSS grid/flex with explicit min widths and a drag handle. Avoid overlaying the path chat as a drawer on top of the tree, since the goal is simultaneous context. + +Acceptance: +- Tree View opens with tree canvas on the left and selected-path chat transcript on the right. +- The split can be resized by pointer and keyboard, and both panes remain usable at supported desktop widths. +- Selecting tree nodes updates the transcript; selecting transcript bubbles highlights the corresponding tree node. 
+- Adding a follow-up prompt updates both panes and shows the pending assistant response in the path chat. +- Existing Chat tab behavior is unchanged. +- Browser tests cover default split rendering, resize behavior, tree-to-chat selection sync, chat-to-tree selection sync, pending response bubble rendering, and absence of the misleading `Open in linear view` label. + +Alternatives considered: +Keeping the current details drawer was rejected because it makes the path transcript feel secondary and separate from the main workflow. Navigating to the existing Chat tab was rejected because it loses branch context at the moment the operator is reasoning about a tree. Replacing the existing Chat tab with Tree View was rejected as too disruptive for MVP. A fixed split was rejected because trees and transcripts vary widely in width; operators need to decide which pane gets space for the current task. + +### 6. Converter Transform Nodes And Comparisons + +Problem: +Converter research shows converters are not just labels or prompt-card chips. They are backend registry instances with type-specific parameters, supported input/output data types, preview behavior, optional LLM-backed cost, and persisted `converter_identifiers` on message pieces. Operators also need to compare direct and converted paths naturally, e.g. `prompt -> direct response` alongside `prompt -> converter(options) -> response`. Hiding converters inside a prompt card makes that branch structure and provenance too hard to see. + +Deliverables: +- Add a visible `ConverterNode` (or equivalent named transform node) to the Tree UI model for V1.0. +- A converter node is a transform/configuration step, not a target-call node. Downstream response nodes remain the refresh/target-call points. +- `Append converter` on a prompt-to-response edge inserts `prompt -> converter -> response placeholder`. +- Preserve or offer an explicit direct baseline path: `prompt -> response placeholder` with no converter. 
+- `Compare converters` creates a converter comparison structure that can fan over converter pipelines/options while keeping the direct baseline visible when requested. +- Converter nodes show: + - converter type and display name, + - configured parameters, + - pipeline order when multiple converters are chained, + - supported input/output data types, + - preview status/output when available, + - LLM-based/cost warning when applicable. +- Tree refresh sends `converter_ids` to the backend. Preview output is inspection-only and must not become the sent message content unless the operator explicitly converts it into prompt text. +- Empty converter branches must be explicit states: `Choose converter`, `Configure converter`, or `No converter baseline`. They must not look like complete duplicate prompts. + +Implementation notes: +- Reuse/extract the existing Chat converter panel behavior where practical: catalog lookup, parameter form, converter instance creation, preview, and `Use Converted Value` semantics. Do not duplicate converter-type introspection logic in Tree-only code. +- Store configured converter references as backend `converter_id`s whenever possible, matching `AddMessageRequest.converter_ids`. Inline converter refs from reload reconstruction can still render read-only or require re-registration before refresh if no backend id is available. +- The runner already resolves `UserTurnNode.params.converterPipeline` into `converter_ids`. Introducing visible converter nodes requires either: + - a reducer/resolver pass that folds converter-node pipelines into the next downstream user turn before dispatch, or + - a small runner extension where `resolvePathPartition` accumulates converter nodes between a prompt and response into the entry's converter pipeline. +- Converter nodes should validate data-type compatibility against the upstream piece(s) and the downstream target's supported modalities before refresh. 
+- LLM-backed converters and file/media-output converters require explicit preview/run actions; do not auto-preview them on every edit. +- Path-chat rendering should show converter nodes as compact transform bubbles between the user prompt and assistant response, with preview clearly labeled as preview. + +Acceptance: +- Operator can create `prompt -> direct response` and `prompt -> converter -> response` sibling paths and see the difference without opening a drawer. +- Operator can configure converter type/params from the converter node and preview supported conversions. +- Refresh of a converted path sends converter IDs to the backend, not locally previewed converted text. +- Direct/no-converter baseline is visually distinct from unconfigured converter variants. +- Data-type incompatibility, missing required params, and LLM-backed preview cost are surfaced before refresh. +- Path-chat shows converter transform bubbles in the selected path. +- Browser tests cover append converter insertion, direct-vs-converted sibling paths, converter comparison variants, preview vs refresh behavior, and data-type/preflight warnings. + +Alternatives considered: +Keeping converters as prompt/edge chips only was rejected because it hides meaningful branch structure and converter provenance. Treating converters as side-effecting response nodes was rejected because converters transform prompts and do not call the target. Keeping only `Fan(axis='converter')` without visible converter cards was rejected because it exposes fan mechanics while hiding transformation intent. Limiting MVP converter fan to pre-existing simple text-to-text converters was rejected as too narrow for Co-PyRIT's converter catalog, which includes modality-changing and LLM-backed converters. + +### 7. 
Playwright MVP Acceptance Harness + +Problem: +Live browser exploration found product mismatches that source review alone did not make visceral: History open-as-tree dropped related conversations, add-follow-up produced a dangling prompt, future-only disabled controls appeared in normal menus, and `Open in linear view` opened a secondary drawer rather than a chat-like surface. The MVP needs a repeatable browser harness that validates the operator experience, not only unit-level tree reducers. + +Deliverables: +- Add or expand a Playwright suite such as `frontend/e2e/tree-mvp.spec.ts` for the Tree UI MVP acceptance path. +- Use mocked API routes for deterministic coverage of core workflows, including multi-conversation attacks, converter catalogs, converter previews, target metadata, missing-target responses, wave summaries, and reload reconstruction. +- Keep live backend/model smoke tests separate and optional. They can validate integration when credentials/targets exist, but they must not be the only MVP sign-off path. +- Capture screenshots for key states: + - empty Tree View, + - Chat open-as-tree entry, + - merged multi-conversation tree, + - pending response placeholder, + - resizable tree/path-chat split, + - converter transform branch with direct baseline, + - attempt fan before and after prune-to-picked-path, + - degraded reconstruction banner. +- Add visual/layout assertions in addition to text assertions: + - no overlapping node cards or rails, + - selected card action rail visible, + - edge insert chips targetable, + - split panes above minimum width, + - text not clipped in critical buttons/chips, + - no visible `Send`, `coming later`, or `future release` copy in normal V1.0 mode. + +Implementation notes: +- Build on existing `frontend/e2e/tree.spec.ts` route-mocking style, but separate MVP acceptance flows from narrower regression tests if the file becomes too large. 
+- Prefer explicit fixtures for one-conversation, multi-conversation shared-prefix, multi-conversation degraded-merge, attempt fan, converter transform, and missing-target attacks. +- Use Playwright locators by role/test id for behavior assertions and screenshots for visual review. Avoid relying only on screenshots for pass/fail. +- Store screenshots as test artifacts rather than committing generated images unless the repo already uses checked-in baselines. +- Run the suite with `VITE_ENABLE_TREE_UI=true` and document the command in the guide or test README if it differs from existing e2e commands. + +Acceptance: +- A reviewer can run one Playwright MVP suite and cover every P0 implementation surface in this guide. +- The suite fails if Chat open-as-tree is absent, multi-conversation reconstruction silently drops conversations, add-follow-up lacks a pending response, future-only controls appear, converter transforms are hidden, fan prune loses the picked path, or split panes become unusable. +- The suite emits screenshots/artifacts that make layout regressions reviewable without manual reproduction. +- Unit tests still cover reducer/resolver details; Playwright covers the operator-facing workflow. + +Alternatives considered: +Ad hoc browser checks were rejected because they already found issues but are too easy to forget. Unit-only coverage was rejected because it misses layout, action rail, menu, and mental-model regressions. Requiring live backend/model tests for every sign-off was rejected as too slow and environment-dependent; live smoke remains useful but optional. + +## Validation / Regression Gates + +These items are already implemented or substantially implemented based on source and live-browser review. They remain V1.0 sign-off gates, but they are tracked as validation/regression coverage rather than fresh P0 implementation work. + +### Gate A. 
Target Registry Recovery / Refresh Preflight + +Current evidence: +Source review shows `AttackSummary.target.target_registry_name` is present in frontend DTOs, auto-reverse populates `RootPrompt.params.targetRegistryName`, and Tree refresh intercepts missing target before dispatch. Live browser review also showed recovered trees with a target value on the root card. + +Validation: +- Historical Open-as-tree from an attack with a target reconstructs a refreshable root target. +- Historical Open-as-tree from an attack without recoverable target does not fail later with an empty-target 404; it tells the operator before dispatch. +- Missing-target UI uses an explicit root warning and modal/banner before any backend call. +- Unit + browser tests cover both recovered-target and missing-target cases. + +Residual risk: +Historical data may still lack recoverable target registry names. That is acceptable only if the UI blocks refresh with a clear preflight explanation. + +### Gate B. Attempt Fan Count Picker + +Current evidence: +Live browser review showed `Fan out response attempts` opening an attempt-count dialog and creating a 3-attempt fan with stale response leaves. Source review shows validation for 2-50 attempts and e2e coverage for a 5-attempt fan. + +Validation: +- Operator can create 2, 3, 5, and larger attempt fans within the allowed range. +- Created fan has correct variant count, child count, slot indices, and no duplicate slot ids. +- Cost preview reflects the created leaves. +- Browser tests cover creating a 5-attempt fan and rejecting invalid counts. + +Residual risk: +Attempt fan cleanup is handled by the new `Prune to picked path` P0 item; this gate only validates creation. + +### Gate C. Branch From Here True Subtree/Path Semantics + +Current evidence: +Source review shows root `Clone tree` and non-root `Branch from here` are separate reducer paths. 
Non-root branching keeps the root-to-selected path plus selected descendants and excludes unrelated sibling branches. + +Validation: +- Root action remains `Clone tree` and clones the whole tree. +- Branching from a middle node creates a new tree containing only the path and selected subtree. +- Sibling branches outside the selected subtree are absent. +- Included nodes preserve executions/history. +- Refresh labels include `parent_conversation_tree_id` for cloned/branched trees. +- Unit tests cover root clone vs non-root branch; browser tests cover branch from a non-root response. + +Residual risk: +The UI label must stay honest after the split-view work. Non-root action should still read `Branch from here`; root action should still read `Clone tree`. + +### Gate D. Long Response Inspection + +Current evidence: +Tree cards already keep previews short and the current details drawer can inspect full node/path content. The newly accepted split workspace supersedes the drawer as the primary long-content surface: selected-path chat becomes the normal way to read full prompts/responses. + +Validation: +- Canvas cards remain short and stable; long content does not expand layout unpredictably. +- Selected-path chat pane shows full prompt/response text without leaving Tree View. +- Latest error state is distinguishable from previous successful response preview. +- Execution id, AR id, conversation id, wave id, target, converter transforms, and copy affordances remain available either in path chat or a clearly labeled metadata/details surface. +- Browser tests cover long text in the path-chat pane and verify no card overlap/regression on the canvas. + +Residual risk: +If the details drawer remains after split-view lands, it must be labeled as metadata/details, not as a separate linear chat experience. + +## P1: Should Fix Before Wider Internal Rollout + +### 8. Auto-Layout and Fit Behavior + +Problem: +Long chains and expanded editors can push useful content offscreen. 
Fit View can make cards too small. Manual dragging helps, but layout needs a first-class reset/autolayout story. + +Deliverables: +- Add explicit `Auto layout` / `Reset layout` control. +- Preserve manual positions during the current tree session. +- Reset manual positions on tree swap or operator command. +- Tune `fitViewOptions`, `minZoom`, and initial viewport so cards are readable. +- Consider `Fit current path` for long chains. +- Ensure layout accounts for expanded editor/detail states or gives enough vertical space. + +Acceptance: +- Opening a long linear tree starts with readable cards, not microscopic cards. +- Operator can drag nodes, then restore deterministic layout. +- No node/card overlap after edit mode opens. +- Browser tests cover overlap checks at desktop and narrow viewport. + +### 9. Action Rail Discoverability + +Problem: +Hover/focus floating rail is visually cleaner but not always discoverable. + +Deliverables: +- Keep the rail on hover/focus and selected card. +- Consider a persistent compact actions trigger per card. +- Ensure selected card always shows rail. +- Confirm touch/mobile behavior. +- Ensure rail never covers editor Save/Cancel controls or nearby cards. + +Acceptance: +- Keyboard and mouse users can discover and operate actions without guessing. +- Rails do not overlap important content. +- Accessibility labels are unique enough to avoid duplicate-reader confusion. + +### 10. Edge Handles and Insert Chips Polish + +Problem: +Handles are now at the card perimeter, but edge/handle/chip alignment still needs visual review on larger and zoomed-out trees. + +Deliverables: +- Align handles with edge paths visually at common zoom levels. +- Increase edge insert chip hit target if needed. +- Show insert chip on selected edge/path, not only hover, if discoverability is weak. +- Ensure chips do not flicker during node dragging. + +Acceptance: +- Moving nodes does not make edges/chips visually flicker in a distracting way. 
+- Insert chip remains targetable at normal zoom. +- Edge handles do not appear inside card content. + +### 11. Delete Confirmation Detail + +Problem: +Delete now confirms, but the copy is generic. + +Deliverables: +- Show deleted subtree count. +- Mention selected node kind. +- Optionally list first few affected node kinds. +- Consider special copy for deleting a fan branch vs a plain chain. + +Acceptance: +- Operator understands the blast radius before deleting. +- Root delete remains unavailable. + +### 12. Past Runs / Execution Detail + +Problem: +PastRunsDrawer exists, but the integrated host/drawer experience has not had enough browser validation. + +Deliverables: +- Wire selected-node detail/past-runs drawer if not already fully integrated. +- Show current execution and reflog entries. +- Pin/unpin works. +- Checkout remains disabled/deferred unless implemented honestly. +- UUID truncation with full title works. + +Acceptance: +- Browser test covers a node with current execution and history. +- Pin/unpin persists in host state. + +### 13. Wave Toast, Retry, Cancel Queued + +Problem: +Wave status ribbon is usable; wave-complete toast/retry/queued flows need browser-level confidence. + +Deliverables: +- Browser validate running, cancel, queued, cancel queued. +- Browser validate complete summary buckets. +- Browser validate retry failed behavior with mock failures. + +Acceptance: +- Operators get immediate feedback after Refresh. +- Failed/rate-limited/permanent/blocked outcomes are understandable. + +## P2: Nice-To-Have / V1.1 Candidates + +### 14. Server-Side Target/Tree Metadata Improvements + +- Persist `conversation_tree_node_id` only if/when V2 server-side tree persistence is accepted. +- Add richer target lineage metadata for historical reconstruction. +- Add server-side tree storage if client-only persistence becomes insufficient. + +### 15. Synced-Peers / Stack Authoring + +- Do not add synced-peer authoring in V1.0 unless operators explicitly ask. 
+- Keep fan-child stack visual aggregation only. + +### 16. Advanced Graph Interactions + +- Drag-and-drop from a node palette. +- Add node on edge drop. +- Keyboard shortcuts beyond basic editor save/cancel. +- Layout animation, if it helps comprehension and does not add flicker. + +## Browser / E2E Coverage Plan + +### Existing Coverage + +- `frontend/e2e/tree.spec.ts` covers: + - Tree View greenfield, + - History Open-as-tree, + - response preview reconstruction, + - add follow-up prompt, + - attempt fan creation, + - converter fan creation. + +### Add Next — Implementation Coverage + +1. Playwright MVP acceptance suite that runs the P0 operator path with mocked APIs and emits screenshots/artifacts. +2. Chat open-as-tree for a one-conversation loaded attack. +3. Chat open-as-tree for a multi-conversation loaded attack, asserting all conversations appear in one tree. +4. History open-as-tree for a multi-conversation row, asserting related conversations are not silently dropped. +5. Degraded reconstruction banner when full-attack merge inference is not possible. +6. Add-follow-up creates an edited prompt plus visible pending response placeholder. +7. Prompt-to-response edge supports converter-node insertion and keeps the downstream response placeholder linked. +8. Normal Tree UI has no visible `Send` vocabulary. +9. Normal Tree UI hides future-only dead-end controls from action rails and edge menus. +10. Attempt fan prune-to-picked-path keeps the selected subtree and removes the fan wrapper/client-only variants. +11. Tree View renders a resizable tree-left/path-chat-right split with selection sync both directions. +12. Converter transform nodes: append converter, direct baseline, comparison variants, preview-vs-refresh behavior, and preflight warnings. + +### Add Next — Regression Gate Coverage + +1. Missing target preflight. +2. Attempt fan count picker, including invalid count rejection. +3. Branch from non-root excludes sibling branches. +4. 
Long-response inspection in the selected-path chat pane, with metadata details available without reintroducing a separate linear view. +5. Delete confirmation cancel/confirm. +6. Reload from `#conversation_tree_id` with fan labels. +7. Dark/light canvas chrome screenshot/assertions. +8. Narrow viewport layout/overlap checks. + +## Recommended Next Work Order + +1. Chat-to-tree full-attack reconstruction. +2. Auto-create response placeholder / no Send vocabulary. +3. Hide future-only dead-end controls. +4. Resizable tree-left/path-chat-right split view. +5. Prune fan to picked path. +6. Converter transform nodes and comparisons. +7. Playwright MVP acceptance harness. +8. Auto layout/reset layout. +9. Past runs / wave toast browser validation. +10. Delete confirmation details. + +## Validation / Regression Work Order + +1. Target registry recovery/preflight. +2. Attempt fan count picker. +3. Branch-from-here true subtree/path semantics. +4. Long-response inspection via selected-path chat. + +## Non-Goals For This Checkpoint + +- Server-side tree persistence. +- Synced-peer stack authoring. +- New fan axes beyond attempt/converter. +- Replacing the linear chat experience. +- Major redesign of the runner dispatch model. 
diff --git a/frontend/e2e/tree-mvp.spec.ts b/frontend/e2e/tree-mvp.spec.ts new file mode 100644 index 0000000000..578a8f09ae --- /dev/null +++ b/frontend/e2e/tree-mvp.spec.ts @@ -0,0 +1,260 @@ +import { test, expect, type Page, type TestInfo } from "@playwright/test"; + +interface MockAttackSummary { + attack_result_id: string; + conversation_id: string; + attack_type: string; + target?: { target_registry_name?: string | null; target_type: string; model_name?: string | null } | null; + converters: string[]; + outcome?: "success" | "failure" | "undetermined" | null; + last_message_preview?: string | null; + message_count: number; + related_conversation_ids: string[]; + labels: Record<string, string>; + created_at: string; + updated_at: string; +} + +const ATTACK_ID = "atk-tree-mvp"; +const MAIN_CONVERSATION_ID = "conv-main"; +const BRANCH_CONVERSATION_ID = "conv-branch"; +const TREE_ID = "tree-mvp"; + +const ATTACK: MockAttackSummary = { + attack_result_id: ATTACK_ID, + conversation_id: MAIN_CONVERSATION_ID, + attack_type: "ManualAttack", + target: { target_registry_name: "OpenAIChatTarget::mvp", target_type: "OpenAIChatTarget", model_name: "gpt-4o" }, + converters: [], + outcome: "undetermined", + last_message_preview: "Answer A", + message_count: 8, + related_conversation_ids: [BRANCH_CONVERSATION_ID], + labels: { operator: "tree_mvp", operation: "tree_mvp", conversation_tree_id: TREE_ID }, + created_at: "2026-06-12T00:00:00Z", + updated_at: "2026-06-12T00:01:00Z", +}; + +function piece(turn: number, role: string, value: string, pieceId: string) { + return { + turn_number: turn, + role, + pieces: [ + { + piece_id: pieceId, + original_value_data_type: "text", + converted_value_data_type: "text", + original_value: value, + converted_value: value, + scores: [], + response_error: "none", + original_prompt_id: pieceId, + converter_identifiers: [], + }, + ], + created_at: "2026-06-12T00:00:00Z", + }; +} + +const MAIN_MESSAGES = { + conversation_id: MAIN_CONVERSATION_ID, + messages: [ + 
piece(1, "user", "Root prompt", "main-p1"), + piece(2, "assistant", "Shared answer", "main-p2"), + piece(3, "user", "Follow A", "main-p3"), + piece(4, "assistant", "Answer A", "main-p4"), + ], +}; + +const BRANCH_MESSAGES = { + conversation_id: BRANCH_CONVERSATION_ID, + messages: [ + piece(1, "user", "Root prompt", "branch-p1"), + piece(2, "assistant", "Shared answer", "branch-p2"), + piece(3, "user", "Follow B", "branch-p3"), + piece(4, "assistant", "Answer B", "branch-p4"), + ], +}; + +async function mockMvpApis(page: Page) { + await page.route(/\/api\/auth\/config/, async (route) => { + await route.fulfill({ status: 200, contentType: "application/json", body: JSON.stringify({ enabled: false }) }); + }); + await page.route(/\/api\/version/, async (route) => { + await route.fulfill({ + status: 200, + contentType: "application/json", + body: JSON.stringify({ version: "test", default_labels: { operator: "tree_mvp", operation: "tree_mvp" } }), + }); + }); + await page.route(/\/api\/health/, async (route) => { + await route.fulfill({ status: 200, contentType: "application/json", body: JSON.stringify({ status: "ok" }) }); + }); + await page.route(/\/api\/labels/, async (route) => { + await route.fulfill({ status: 200, contentType: "application/json", body: JSON.stringify({ source: "attacks", labels: {} }) }); + }); + await page.route(/\/api\/attacks\/attack-options/, async (route) => { + await route.fulfill({ status: 200, contentType: "application/json", body: JSON.stringify({ attack_types: ["ManualAttack"] }) }); + }); + await page.route(/\/api\/attacks\/converter-options/, async (route) => { + await route.fulfill({ status: 200, contentType: "application/json", body: JSON.stringify({ converter_types: [] }) }); + }); + await page.route(/\/api\/converters\/catalog(?:\?|$)/, async (route) => { + await route.fulfill({ + status: 200, + contentType: "application/json", + body: JSON.stringify({ + items: [ + { + converter_type: "Base64Converter", + supported_input_types: 
["text"], + supported_output_types: ["text"], + is_llm_based: false, + description: "Encode text as base64.", + parameters: [ + { + name: "encoding_func", + type_name: "Literal['b64encode', 'urlsafe_b64encode']", + required: false, + default_value: "b64encode", + choices: ["b64encode", "urlsafe_b64encode"], + description: "Encoding function", + }, + ], + }, + ], + }), + }); + }); + await page.route(/\/api\/converters(?:\?|$)/, async (route) => { + if (route.request().method() === "POST") { + await route.fulfill({ + status: 200, + contentType: "application/json", + body: JSON.stringify({ converter_id: "configured-base64", converter_type: "Base64Converter" }), + }); + return; + } + await route.fulfill({ + status: 200, + contentType: "application/json", + body: JSON.stringify({ items: [{ converter_id: "base64", converter_type: "Base64Converter", display_name: "Base64" }] }), + }); + }); + await page.route(new RegExp(`/api/attacks/${ATTACK_ID}$`), async (route) => { + await route.fulfill({ status: 200, contentType: "application/json", body: JSON.stringify(ATTACK) }); + }); + await page.route(new RegExp(`/api/attacks/${ATTACK_ID}/conversations$`), async (route) => { + await route.fulfill({ + status: 200, + contentType: "application/json", + body: JSON.stringify({ + attack_result_id: ATTACK_ID, + main_conversation_id: MAIN_CONVERSATION_ID, + conversations: [ + { conversation_id: MAIN_CONVERSATION_ID, message_count: 4, last_message_preview: "Answer A" }, + { conversation_id: BRANCH_CONVERSATION_ID, message_count: 4, last_message_preview: "Answer B" }, + ], + }), + }); + }); + await page.route(new RegExp(`/api/attacks/${ATTACK_ID}/messages`), async (route) => { + const url = new URL(route.request().url()); + const conversationId = url.searchParams.get("conversation_id"); + await route.fulfill({ + status: 200, + contentType: "application/json", + body: JSON.stringify(conversationId === BRANCH_CONVERSATION_ID ? 
BRANCH_MESSAGES : MAIN_MESSAGES), + }); + }); + await page.route(/\/api\/attacks(?:\?|$)/, async (route) => { + if (route.request().method() !== "GET") { + await route.continue(); + return; + } + await route.fulfill({ + status: 200, + contentType: "application/json", + body: JSON.stringify({ items: [ATTACK], pagination: { limit: 25, has_more: false, next_cursor: null, prev_cursor: null } }), + }); + }); +} + +async function screenshot(page: Page, testInfo: TestInfo, name: string) { + await page.screenshot({ path: testInfo.outputPath(`${name}.png`), fullPage: true }); +} + +test.describe("Tree UI MVP acceptance", () => { + test.beforeEach(async ({ page }) => { + await mockMvpApis(page); + }); + + test("opens the loaded Chat attack as a merged tree with path chat", async ({ page }, testInfo) => { + await page.goto("/"); + await page.getByTitle("Attack History").click(); + await expect(page.getByTestId(`attack-row-${ATTACK_ID}`)).toBeVisible({ timeout: 10_000 }); + await page.getByTestId(`open-attack-${ATTACK_ID}`).click(); + await expect(page.getByText("Shared answer")).toBeVisible({ timeout: 10_000 }); + + await page.getByTestId("open-chat-attack-as-tree-btn").click(); + await expect(page.locator("[data-tree-path-chat-pane]")).toBeVisible({ timeout: 10_000 }); + await expect(page.locator("main")).toContainText("Root prompt"); + await expect(page.getByText("Follow A")).toBeVisible(); + await expect(page.getByText("Follow B")).toBeVisible(); + await expect(page.locator("[data-tree-path-chat-splitter]")).toBeVisible(); + await expect(page.locator("main")).not.toContainText(/\bSend\b|coming later|future release/i); + await screenshot(page, testInfo, "chat-open-merged-tree"); + }); + + test("adding a follow-up prompt creates a pending response", async ({ page }, testInfo) => { + await page.goto("/"); + await page.getByTitle("Attack History").click(); + await page.getByTestId(`open-attack-as-tree-${ATTACK_ID}`).click(); + await expect(page.getByText("Shared 
answer")).toBeVisible({ timeout: 10_000 }); + + await page.locator("[data-tree-node-id]").filter({ hasText: "Shared answer" }).locator('button[aria-label="Focus in path chat"]').first().click(); + await page.getByRole("textbox", { name: "Follow-up prompt" }).fill("New prompt from path chat"); + await page.getByRole("button", { name: "Run" }).click(); + await expect(page.locator("[data-tree-path-chat-pane]")).toContainText("New prompt from path chat"); + await expect(page.locator("[data-tree-path-chat-pane]")).toContainText("Pending response"); + await screenshot(page, testInfo, "pending-response-follow-up"); + }); + + test("attempt fan can be pruned to the picked path", async ({ page }, testInfo) => { + await page.goto("/"); + await page.getByTitle("Attack History").click(); + await page.getByTestId(`open-attack-as-tree-${ATTACK_ID}`).click(); + await expect(page.getByText("Shared answer")).toBeVisible({ timeout: 10_000 }); + + await page.locator('button[aria-label="Fan out response attempts"]').first().click(); + await page.getByRole("spinbutton", { name: "Attempt count" }).fill("3"); + await page.getByRole("button", { name: "Create" }).click(); + await expect(page.locator("main")).toContainText(/3 variants/); + await page.locator('button[aria-label="Pick this attempt"]').first().click(); + await page.locator('button[aria-label^="Prune to picked slot"]').click(); + await page.getByRole("button", { name: /^Prune$/ }).click(); + await expect(page.locator("main")).not.toContainText(/3 variants/); + await screenshot(page, testInfo, "pruned-fan"); + }); + + test("converter insertion creates a visible transform branch with direct baseline", async ({ page }, testInfo) => { + await page.goto("/"); + await page.getByTitle("Attack History").click(); + await page.getByTestId(`open-attack-as-tree-${ATTACK_ID}`).click(); + await expect(page.getByText("Follow A")).toBeVisible({ timeout: 10_000 }); + + await page.getByRole("button", { name: "Insert after user turn" 
}).first().click(); + await page.getByRole("menuitem", { name: "Append converter" }).click(); + await expect(page.locator("[data-tree-node-id]").filter({ hasText: "Choose converter" }).first()).toBeVisible(); + await expect(page.getByText("Answer A")).toBeVisible(); + await expect(page.getByText("Pending response")).toBeVisible(); + + await page.getByRole("button", { name: "Choose converter" }).click(); + await page.getByRole("menuitem", { name: "Configure converter..." }).click(); + await page.getByRole("combobox", { name: "Converter type" }).selectOption("Base64Converter"); + await page.getByTestId("param-encoding_func").selectOption("urlsafe_b64encode"); + await page.getByRole("button", { name: "Save" }).click(); + await expect(page.locator("[data-tree-node-id]").filter({ hasText: "Base64Converter" }).first()).toBeVisible(); + await screenshot(page, testInfo, "converter-transform-branch"); + }); +}); diff --git a/frontend/e2e/tree.spec.ts b/frontend/e2e/tree.spec.ts new file mode 100644 index 0000000000..bd2dc3a3aa --- /dev/null +++ b/frontend/e2e/tree.spec.ts @@ -0,0 +1,229 @@ +import { test, expect, type Page } from "@playwright/test"; + +interface MockAttackSummary { + attack_result_id: string; + conversation_id: string; + attack_type: string; + target?: { target_registry_name?: string | null; target_type: string; endpoint?: string | null; model_name?: string | null } | null; + converters: string[]; + outcome?: "success" | "failure" | "undetermined" | null; + last_message_preview?: string | null; + message_count: number; + related_conversation_ids: string[]; + labels: Record; + created_at: string; + updated_at: string; +} + +const TREE_ID = "tree-e2e-1"; +const ATTACK_ID = "atk-tree-e2e"; +const CONVERSATION_ID = "conv-tree-e2e"; + +const ATTACK: MockAttackSummary = { + attack_result_id: ATTACK_ID, + conversation_id: CONVERSATION_ID, + attack_type: "ManualAttack", + target: { target_registry_name: "OpenAIChatTarget::target1234", target_type: 
"OpenAIChatTarget", model_name: "gpt-4o" }, + converters: [], + outcome: "undetermined", + last_message_preview: "Seed assistant follow-up response", + message_count: 4, + related_conversation_ids: [], + labels: { operator: "tree_e2e", operation: "tree_ui", conversation_tree_id: TREE_ID }, + created_at: "2026-06-12T00:00:00Z", + updated_at: "2026-06-12T00:01:00Z", +}; + +const ATTACK_WITHOUT_TARGET: MockAttackSummary = { + ...ATTACK, + target: null, +}; + +let activeAttack: MockAttackSummary = ATTACK; + +const MESSAGES_RESPONSE = { + conversation_id: CONVERSATION_ID, + messages: [ + message(1, "user", "Seed root prompt", "p1"), + message(2, "assistant", "Seed assistant response", "p2"), + message(3, "user", "Seed follow-up prompt", "p3"), + message(4, "assistant", "Seed assistant follow-up response", "p4"), + ], +}; + +function message(turn: number, role: string, value: string, pieceId: string) { + return { + turn_number: turn, + role, + pieces: [ + { + piece_id: pieceId, + original_value_data_type: "text", + converted_value_data_type: "text", + original_value: value, + converted_value: value, + scores: [], + response_error: "none", + original_prompt_id: pieceId, + converter_identifiers: [], + }, + ], + created_at: "2026-06-12T00:00:00Z", + }; +} + +async function mockTreeApis(page: Page) { + await page.route(/\/api\/auth\/config/, async (route) => { + await route.fulfill({ status: 200, contentType: "application/json", body: JSON.stringify({ enabled: false }) }); + }); + await page.route(/\/api\/version/, async (route) => { + await route.fulfill({ + status: 200, + contentType: "application/json", + body: JSON.stringify({ version: "test", default_labels: { operator: "tree_e2e", operation: "tree_ui" } }), + }); + }); + await page.route(/\/api\/health/, async (route) => { + await route.fulfill({ status: 200, contentType: "application/json", body: JSON.stringify({ status: "ok" }) }); + }); + await page.route(/\/api\/labels/, async (route) => { + await route.fulfill({ + 
status: 200, + contentType: "application/json", + body: JSON.stringify({ source: "attacks", labels: { operator: ["tree_e2e"], operation: ["tree_ui"] } }), + }); + }); + await page.route(/\/api\/attacks\/attack-options/, async (route) => { + await route.fulfill({ status: 200, contentType: "application/json", body: JSON.stringify({ attack_types: ["ManualAttack"] }) }); + }); + await page.route(/\/api\/attacks\/converter-options/, async (route) => { + await route.fulfill({ status: 200, contentType: "application/json", body: JSON.stringify({ converter_types: [] }) }); + }); + await page.route(/\/api\/converters(?:\?|$)/, async (route) => { + await route.fulfill({ + status: 200, + contentType: "application/json", + body: JSON.stringify({ items: [{ converter_id: "base64", converter_type: "Base64Converter", display_name: "Base64" }] }), + }); + }); + await page.route(new RegExp(`/api/attacks/${ATTACK_ID}$`), async (route) => { + await route.fulfill({ status: 200, contentType: "application/json", body: JSON.stringify(activeAttack) }); + }); + await page.route(new RegExp(`/api/attacks/${ATTACK_ID}/conversations$`), async (route) => { + await route.fulfill({ + status: 200, + contentType: "application/json", + body: JSON.stringify({ + attack_result_id: ATTACK_ID, + main_conversation_id: CONVERSATION_ID, + conversations: [ + { conversation_id: CONVERSATION_ID, message_count: 4, last_message_preview: "Seed assistant follow-up response" }, + ], + }), + }); + }); + await page.route(new RegExp(`/api/attacks/${ATTACK_ID}/messages`), async (route) => { + await route.fulfill({ status: 200, contentType: "application/json", body: JSON.stringify(MESSAGES_RESPONSE) }); + }); + await page.route(/\/api\/attacks(?:\?|$)/, async (route) => { + if (route.request().method() !== "GET") { + await route.continue(); + return; + } + const url = new URL(route.request().url()); + const labels = url.searchParams.getAll("label"); + const wantsTree = labels.includes(`conversation_tree_id:${TREE_ID}`); + 
await route.fulfill({ + status: 200, + contentType: "application/json", + body: JSON.stringify({ + items: wantsTree || labels.length === 0 ? [activeAttack] : [], + pagination: { limit: Number(url.searchParams.get("limit") ?? 25), has_more: false, next_cursor: null, prev_cursor: null }, + }), + }); + }); +} + +async function openSeedTree(page: Page) { + await page.getByTitle("Attack History").click(); + await page.waitForTimeout(300); + const discard = page.getByRole("button", { name: /Discard and continue/i }); + if (await discard.isVisible().catch(() => false)) { + await discard.click(); + } + const row = page.getByTestId(`attack-row-${ATTACK_ID}`); + await expect(row).toBeVisible({ timeout: 10_000 }); + await page.getByTestId(`open-attack-as-tree-${ATTACK_ID}`).click(); + if (await discard.isVisible().catch(() => false)) { + await discard.click(); + await expect(row).toBeVisible({ timeout: 10_000 }); + await page.getByTestId(`open-attack-as-tree-${ATTACK_ID}`).click(); + } + await expect(page.getByText("Seed assistant response")).toBeVisible({ timeout: 10_000 }); +} + +test.describe("Tree UI", () => { + test.beforeEach(async ({ page }) => { + activeAttack = ATTACK; + await mockTreeApis(page); + }); + + test("opens history attack as a tree and reconstructs response previews", async ({ page }) => { + await page.goto("/"); + await page.getByTitle("Tree View").click(); + await expect(page.getByText(/No tree loaded/i)).toBeVisible(); + + await page.getByTitle("Attack History").click(); + await expect(page.getByTestId(`attack-row-${ATTACK_ID}`)).toBeVisible({ timeout: 10_000 }); + await page.getByTestId(`open-attack-as-tree-${ATTACK_ID}`).click(); + + await expect(page).toHaveURL(new RegExp(`conversation_tree_id=${TREE_ID}`)); + await expect(page.getByText("Seed root prompt").first()).toBeVisible(); + await expect(page.getByText("OpenAIChatTarget::target1234")).toBeVisible(); + await expect(page.getByText("Seed assistant response")).toBeVisible(); + await 
expect(page.getByText("Seed follow-up prompt")).toBeVisible(); + await expect(page.getByText("Seed assistant follow-up response")).toBeVisible(); + }); + + test("adds a follow-up prompt after a response", async ({ page }) => { + await page.goto("/"); + await openSeedTree(page); + + await page.getByRole("button", { name: "Add follow-up prompt" }).first().click(); + await expect(page.getByText("New prompt")).toBeVisible(); + }); + + test("creates an attempt fan from a response", async ({ page }) => { + await page.goto("/"); + await openSeedTree(page); + + await page.getByRole("button", { name: "Fan out response attempts" }).first().click(); + await expect(page.getByRole("dialog", { name: "Fan out response attempts" })).toBeVisible(); + await page.getByRole("spinbutton", { name: "Attempt count" }).fill("5"); + await page.getByRole("button", { name: "Create" }).click(); + await expect(page.locator("main")).toContainText(/axis:\s*attempt/); + await expect(page.locator("main")).toContainText(/5 variants/i); + }); + + test("creates a converter fan from a response", async ({ page }) => { + await page.goto("/"); + await openSeedTree(page); + + await page.getByRole("button", { name: "Compare converters" }).first().click(); + await expect(page.locator("main")).toContainText(/axis:\s*converter/); + await expect(page.locator("main")).toContainText("New prompt"); + }); + + test("shows no-target preflight before refreshing a recovered tree without a target", async ({ page }) => { + activeAttack = ATTACK_WITHOUT_TARGET; + await page.goto("/"); + await openSeedTree(page); + + await expect(page.getByText("No target")).toBeVisible(); + await page.getByRole("button", { name: "Edit root prompt" }).click(); + await page.getByRole("textbox", { name: "Prompt text" }).fill("Edited seed root prompt"); + await page.getByRole("button", { name: "Save" }).click(); + await page.getByRole("button", { name: /Refresh/ }).first().click(); + await expect(page.getByRole("dialog", { name: "No target 
selected" })).toBeVisible(); + }); +}); diff --git a/frontend/jest.config.ts b/frontend/jest.config.ts index 6cdf3bd10b..20516f3df2 100644 --- a/frontend/jest.config.ts +++ b/frontend/jest.config.ts @@ -8,6 +8,12 @@ const config: Config = { moduleNameMapper: { "^@/(.*)$": "/src/$1", "\\.(css|less|scss|sass)$": "identity-obj-proxy", + // d3-hierarchy ships ESM source as `main`. ts-jest's transform + // ignores `.js` and jest's CJS require trips on its `import` + // statements. Redirect to the UMD bundle at /dist which works + // under CJS without any transform. Production keeps the ESM + // path (Vite handles it natively). + "^d3-hierarchy$": "/node_modules/d3-hierarchy/dist/d3-hierarchy.js", }, setupFilesAfterEnv: ["/src/setupTests.ts"], collectCoverageFrom: [ diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 9d388d6d24..4e6c657287 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -12,7 +12,9 @@ "@azure/msal-react": "^5.4.3", "@fluentui/react-components": "9.74.1", "@fluentui/react-icons": "2.0.329", + "@xyflow/react": "^12.11.0", "axios": "1.17.0", + "d3-hierarchy": "^3.1.2", "react": "19.2.7", "react-dom": "19.2.7", "react-error-boundary": "6.1.2" @@ -24,6 +26,7 @@ "@testing-library/jest-dom": "6.9.1", "@testing-library/react": "16.3.2", "@testing-library/user-event": "14.6.1", + "@types/d3-hierarchy": "^3.1.7", "@types/jest": "30.0.0", "@types/node": "25.9.2", "@types/react": "19.2.17", @@ -31,6 +34,7 @@ "@typescript-eslint/eslint-plugin": "8.61.0", "@typescript-eslint/parser": "8.61.0", "@vitejs/plugin-react": "6.0.2", + "broadcast-channel": "^7.3.0", "esbuild": "0.28.1", "eslint": "10.4.1", "eslint-plugin-react-hooks": "7.1.1", @@ -570,9 +574,9 @@ } }, "node_modules/@babel/runtime": { - "version": "7.28.4", - "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.28.4.tgz", - "integrity": "sha512-Q/N6JNWvIvPnLDvjlE1OUBLPQHH6l3CltCEsHIujp45zQUSSh8K+gHnaEX45yAT1nyngnINhvWtzN+Nb9D8RAQ==", + "version": 
"7.28.6", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.28.6.tgz", + "integrity": "sha512-05WQkdpL9COIMz4LjTxGpPNCdlpyimKppYNoJ5Di5EUObifl8t4tuLuUBBZEpoLYOmfvIWrsp9fCl0HoPRVTdA==", "license": "MIT", "engines": { "node": ">=6.9.0" @@ -4145,6 +4149,62 @@ "@babel/types": "^7.28.2" } }, + "node_modules/@types/d3-color": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/@types/d3-color/-/d3-color-3.1.3.tgz", + "integrity": "sha512-iO90scth9WAbmgv7ogoq57O9YpKmFBbmoEoCHDB2xMBY0+/KVrqAaCDyCE16dUspeOvIxFFRI+0sEtqDqy2b4A==", + "license": "MIT" + }, + "node_modules/@types/d3-drag": { + "version": "3.0.7", + "resolved": "https://registry.npmjs.org/@types/d3-drag/-/d3-drag-3.0.7.tgz", + "integrity": "sha512-HE3jVKlzU9AaMazNufooRJ5ZpWmLIoc90A37WU2JMmeq28w1FQqCZswHZ3xR+SuxYftzHq6WU6KJHvqxKzTxxQ==", + "license": "MIT", + "dependencies": { + "@types/d3-selection": "*" + } + }, + "node_modules/@types/d3-hierarchy": { + "version": "3.1.7", + "resolved": "https://registry.npmjs.org/@types/d3-hierarchy/-/d3-hierarchy-3.1.7.tgz", + "integrity": "sha512-tJFtNoYBtRtkNysX1Xq4sxtjK8YgoWUNpIiUee0/jHGRwqvzYxkq0hGVbbOGSz+JgFxxRu4K8nb3YpG3CMARtg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-interpolate": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@types/d3-interpolate/-/d3-interpolate-3.0.4.tgz", + "integrity": "sha512-mgLPETlrpVV1YRJIglr4Ez47g7Yxjl1lj7YKsiMCb27VJH9W8NVM6Bb9d8kkpG/uAQS5AmbA48q2IAolKKo1MA==", + "license": "MIT", + "dependencies": { + "@types/d3-color": "*" + } + }, + "node_modules/@types/d3-selection": { + "version": "3.0.11", + "resolved": "https://registry.npmjs.org/@types/d3-selection/-/d3-selection-3.0.11.tgz", + "integrity": "sha512-bhAXu23DJWsrI45xafYpkQ4NtcKMwWnAC/vKrd2l+nxMFuvOT3XMYTIj2opv8vq8AO5Yh7Qac/nSeP/3zjTK0w==", + "license": "MIT" + }, + "node_modules/@types/d3-transition": { + "version": "3.0.9", + "resolved": 
"https://registry.npmjs.org/@types/d3-transition/-/d3-transition-3.0.9.tgz", + "integrity": "sha512-uZS5shfxzO3rGlu0cC3bjmMFKsXv+SmZZcgp0KD22ts4uGXp5EVYGzu/0YdwZeKmddhcAccYtREJKkPfXkZuCg==", + "license": "MIT", + "dependencies": { + "@types/d3-selection": "*" + } + }, + "node_modules/@types/d3-zoom": { + "version": "3.0.8", + "resolved": "https://registry.npmjs.org/@types/d3-zoom/-/d3-zoom-3.0.8.tgz", + "integrity": "sha512-iqMC4/YlFCSlO8+2Ii1GGGliCAY4XdeG748w5vQUbevlbDu0zSjH/+jojorQVBK/se0j6DUFNPBGSqD3YWYnDw==", + "license": "MIT", + "dependencies": { + "@types/d3-interpolate": "*", + "@types/d3-selection": "*" + } + }, "node_modules/@types/esrecurse": { "version": "4.3.1", "resolved": "https://registry.npmjs.org/@types/esrecurse/-/esrecurse-4.3.1.tgz", @@ -4885,6 +4945,48 @@ } } }, + "node_modules/@xyflow/react": { + "version": "12.11.0", + "resolved": "https://registry.npmjs.org/@xyflow/react/-/react-12.11.0.tgz", + "integrity": "sha512-na4IO33FSs2OS72hASgZDmTYwFAkef7Z74uBUVrong3ARmQQHfnRUVaCFn1kTt5LbS6pK03TbYjCPGLjLFfziA==", + "license": "MIT", + "dependencies": { + "@xyflow/system": "0.0.77", + "classcat": "^5.0.3", + "zustand": "^4.4.0" + }, + "peerDependencies": { + "@types/react": ">=17", + "@types/react-dom": ">=17", + "react": ">=17", + "react-dom": ">=17" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, + "node_modules/@xyflow/system": { + "version": "0.0.77", + "resolved": "https://registry.npmjs.org/@xyflow/system/-/system-0.0.77.tgz", + "integrity": "sha512-qCDCMCQAAgUu8yHnhloHG9F5mwPX5E+Wl8McpYIOPSSXfzFJJoZcwOcsDiAjitVKIg2de1WmJbCHfpcvxprsgg==", + "license": "MIT", + "dependencies": { + "@types/d3-drag": "^3.0.7", + "@types/d3-interpolate": "^3.0.4", + "@types/d3-selection": "^3.0.10", + "@types/d3-transition": "^3.0.8", + "@types/d3-zoom": "^3.0.8", + "d3-drag": "^3.0.0", + "d3-interpolate": "^3.0.1", + "d3-selection": "^3.0.0", + "d3-zoom": "^3.0.0" + } + }, 
"node_modules/acorn": { "version": "8.16.0", "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.16.0.tgz", @@ -5206,6 +5308,22 @@ "node": "18 || 20 || >=22" } }, + "node_modules/broadcast-channel": { + "version": "7.3.0", + "resolved": "https://registry.npmjs.org/broadcast-channel/-/broadcast-channel-7.3.0.tgz", + "integrity": "sha512-UHPhLBQKfQ8OmMFMpmPfO5dRakyA1vsfiDGWTYNvChYol65tbuhivPEGgZZiuetorvExdvxaWiBy/ym1Ty08yA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/runtime": "7.28.6", + "oblivious-set": "2.0.0", + "p-queue": "6.6.2", + "unload": "2.4.1" + }, + "funding": { + "url": "https://github.com/sponsors/pubkey" + } + }, "node_modules/browserslist": { "version": "4.28.2", "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.28.2.tgz", @@ -5374,6 +5492,12 @@ "dev": true, "license": "MIT" }, + "node_modules/classcat": { + "version": "5.0.5", + "resolved": "https://registry.npmjs.org/classcat/-/classcat-5.0.5.tgz", + "integrity": "sha512-JhZUT7JFcQy/EzW605k/ktHtncoo9vnyW/2GspNYwFlN1C/WmjuV/xtS04e9SOkL2sTdw0VAZ2UGCcQ9lR6p6w==", + "license": "MIT" + }, "node_modules/cliui": { "version": "8.0.1", "resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz", @@ -5555,6 +5679,120 @@ "integrity": "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==", "license": "MIT" }, + "node_modules/d3-color": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/d3-color/-/d3-color-3.1.0.tgz", + "integrity": "sha512-zg/chbXyeBtMQ1LbD/WSoW2DpC3I0mpmPdW+ynRTj/x2DAWYrIY7qeZIHidozwV24m4iavr15lNwIwLxRmOxhA==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-dispatch": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-dispatch/-/d3-dispatch-3.0.1.tgz", + "integrity": "sha512-rzUyPU/S7rwUflMyLc1ETDeBj0NRuHKKAcvukozwhshr6g6c5d8zh4c2gQjY2bZ0dXeGLWc1PF174P2tVvKhfg==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + 
"node_modules/d3-drag": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/d3-drag/-/d3-drag-3.0.0.tgz", + "integrity": "sha512-pWbUJLdETVA8lQNJecMxoXfH6x+mO2UQo8rSmZ+QqxcbyA3hfeprFgIT//HW2nlHChWeIIMwS2Fq+gEARkhTkg==", + "license": "ISC", + "dependencies": { + "d3-dispatch": "1 - 3", + "d3-selection": "3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-ease": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-ease/-/d3-ease-3.0.1.tgz", + "integrity": "sha512-wR/XK3D3XcLIZwpbvQwQ5fK+8Ykds1ip7A2Txe0yxncXSdq1L9skcG7blcedkOX+ZcgxGAmLX1FrRGbADwzi0w==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-hierarchy": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/d3-hierarchy/-/d3-hierarchy-3.1.2.tgz", + "integrity": "sha512-FX/9frcub54beBdugHjDCdikxThEqjnR93Qt7PvQTOHxyiNCAlvMrHhclk3cD5VeAaq9fxmfRp+CnWw9rEMBuA==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-interpolate": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-interpolate/-/d3-interpolate-3.0.1.tgz", + "integrity": "sha512-3bYs1rOD33uo8aqJfKP3JWPAibgw8Zm2+L9vBKEHJ2Rg+viTR7o5Mmv5mZcieN+FRYaAOWX5SJATX6k1PWz72g==", + "license": "ISC", + "dependencies": { + "d3-color": "1 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-selection": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/d3-selection/-/d3-selection-3.0.0.tgz", + "integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-timer": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-timer/-/d3-timer-3.0.1.tgz", + "integrity": "sha512-ndfJ/JxxMd3nw31uyKoY2naivF+r29V+Lc0svZxe1JvvIRmi8hUsrMvdOwgS1o6uBHmiz91geQ0ylPP0aj1VUA==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-transition": { + 
"version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-transition/-/d3-transition-3.0.1.tgz", + "integrity": "sha512-ApKvfjsSR6tg06xrL434C0WydLr7JewBB3V+/39RMHsaXTOG0zmt/OAXeng5M5LBm0ojmxJrpomQVZ1aPvBL4w==", + "license": "ISC", + "dependencies": { + "d3-color": "1 - 3", + "d3-dispatch": "1 - 3", + "d3-ease": "1 - 3", + "d3-interpolate": "1 - 3", + "d3-timer": "1 - 3" + }, + "engines": { + "node": ">=12" + }, + "peerDependencies": { + "d3-selection": "2 - 3" + } + }, + "node_modules/d3-zoom": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/d3-zoom/-/d3-zoom-3.0.0.tgz", + "integrity": "sha512-b8AmV3kfQaqWAuacbPuNbL6vahnOJflOhexLzMMNLga62+/nh0JzvJ0aO/5a5MVgUFGS7Hu1P9P03o3fJkDCyw==", + "license": "ISC", + "dependencies": { + "d3-dispatch": "1 - 3", + "d3-drag": "2 - 3", + "d3-interpolate": "1 - 3", + "d3-selection": "2 - 3", + "d3-transition": "2 - 3" + }, + "engines": { + "node": ">=12" + } + }, "node_modules/data-urls": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-5.0.0.tgz", @@ -6118,6 +6356,13 @@ "node": ">=0.10.0" } }, + "node_modules/eventemitter3": { + "version": "4.0.7", + "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-4.0.7.tgz", + "integrity": "sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw==", + "dev": true, + "license": "MIT" + }, "node_modules/execa": { "version": "5.1.1", "resolved": "https://registry.npmjs.org/execa/-/execa-5.1.1.tgz", @@ -8519,6 +8764,16 @@ "dev": true, "license": "MIT" }, + "node_modules/oblivious-set": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/oblivious-set/-/oblivious-set-2.0.0.tgz", + "integrity": "sha512-QOUH5Xrsced9fKXaQTjWoDGKeS/Or7E2jB0FN63N4mkAO4qJdB7WR7e6qWAOHM5nk25FJ8TGjhP7DH4l6vFVLg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=16" + } + }, "node_modules/once": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", 
@@ -8563,6 +8818,16 @@ "node": ">= 0.8.0" } }, + "node_modules/p-finally": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/p-finally/-/p-finally-1.0.0.tgz", + "integrity": "sha512-LICb2p9CB7FS+0eR1oqWnHhp0FljGLZCWBE9aix0Uye9W8LTQPwMTYVGWQWIw9RdQiDg4+epXQODwIYJtSJaow==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, "node_modules/p-limit": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", @@ -8595,6 +8860,36 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/p-queue": { + "version": "6.6.2", + "resolved": "https://registry.npmjs.org/p-queue/-/p-queue-6.6.2.tgz", + "integrity": "sha512-RwFpb72c/BhQLEXIZ5K2e+AhgNVmIejGlTgiB9MzZ0e93GRvqZ7uSi0dvRF7/XIXDeNkra2fNHBxTyPDGySpjQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "eventemitter3": "^4.0.4", + "p-timeout": "^3.2.0" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/p-timeout": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-3.2.0.tgz", + "integrity": "sha512-rhIwUycgwwKcP9yTOOFK/AKsAopjjCakVqLHePO3CC6Mir1Z99xT+R63jZxAT5lFZLa2inS5h+ZS2GvR99/FBg==", + "dev": true, + "license": "MIT", + "dependencies": { + "p-finally": "^1.0.0" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/p-try": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz", @@ -9814,6 +10109,16 @@ "dev": true, "license": "MIT" }, + "node_modules/unload": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/unload/-/unload-2.4.1.tgz", + "integrity": "sha512-IViSAm8Z3sRBYA+9wc0fLQmU9Nrxb16rcDmIiR6Y9LJSZzI7QY5QsDhqPpKOjAn0O9/kfK1TfNEMMAGPTIraPw==", + "dev": true, + "license": "Apache-2.0", + "funding": { + "url": "https://github.com/sponsors/pubkey" + } + }, "node_modules/unrs-resolver": { "version": "1.12.2", "resolved": 
"https://registry.npmjs.org/unrs-resolver/-/unrs-resolver-1.12.2.tgz", @@ -10377,6 +10682,34 @@ "peerDependencies": { "zod": "^3.25.0 || ^4.0.0" } + }, + "node_modules/zustand": { + "version": "4.5.7", + "resolved": "https://registry.npmjs.org/zustand/-/zustand-4.5.7.tgz", + "integrity": "sha512-CHOUy7mu3lbD6o6LJLfllpjkzhHXSBlX8B9+qPddUsIfeF5S/UZ5q0kmCsnRqT1UHFQZchNFDDzMbQsuesHWlw==", + "license": "MIT", + "dependencies": { + "use-sync-external-store": "^1.2.2" + }, + "engines": { + "node": ">=12.7.0" + }, + "peerDependencies": { + "@types/react": ">=16.8", + "immer": ">=9.0.6", + "react": ">=16.8" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "immer": { + "optional": true + }, + "react": { + "optional": true + } + } } } } diff --git a/frontend/package.json b/frontend/package.json index e0005b754b..2dac7ae665 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -9,6 +9,7 @@ "preview": "vite preview", "lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0", "type-check": "tsc --noEmit", + "type-check:contract": "tsc --noEmit -p tsconfig.contract.json", "start": "python dev.py start", "restart": "python dev.py restart", "stop": "python dev.py stop", @@ -26,7 +27,9 @@ "@azure/msal-react": "^5.4.3", "@fluentui/react-components": "9.74.1", "@fluentui/react-icons": "2.0.329", + "@xyflow/react": "^12.11.0", "axios": "1.17.0", + "d3-hierarchy": "^3.1.2", "react": "19.2.7", "react-dom": "19.2.7", "react-error-boundary": "6.1.2" @@ -38,6 +41,7 @@ "@testing-library/jest-dom": "6.9.1", "@testing-library/react": "16.3.2", "@testing-library/user-event": "14.6.1", + "@types/d3-hierarchy": "^3.1.7", "@types/jest": "30.0.0", "@types/node": "25.9.2", "@types/react": "19.2.17", @@ -45,6 +49,7 @@ "@typescript-eslint/eslint-plugin": "8.61.0", "@typescript-eslint/parser": "8.61.0", "@vitejs/plugin-react": "6.0.2", + "broadcast-channel": "^7.3.0", "esbuild": "0.28.1", "eslint": "10.4.1", 
"eslint-plugin-react-hooks": "7.1.1", diff --git a/frontend/src/App.test.tsx b/frontend/src/App.test.tsx index d3896fa297..a08968f1b4 100644 --- a/frontend/src/App.test.tsx +++ b/frontend/src/App.test.tsx @@ -21,6 +21,9 @@ jest.mock("./services/api", () => ({ createAttack: jest.fn(), deleteAttack: jest.fn(), }, + convertersApi: { + listConverters: jest.fn().mockResolvedValue({ items: [] }), + }, versionApi: { getVersion: jest.fn().mockResolvedValue({ version: "1.0.0" }), }, @@ -73,6 +76,9 @@ jest.mock("./components/Layout/MainLayout", () => { + {children} ); @@ -176,8 +182,10 @@ jest.mock("./components/Config/TargetConfig", () => { jest.mock("./components/History/AttackHistory", () => { const MockAttackHistory = ({ onOpenAttack, + onOpenAttackAsTree, }: { onOpenAttack: (attackResultId: string) => void; + onOpenAttackAsTree?: (attackResultId: string) => void; }) => { return (
@@ -193,6 +201,14 @@ jest.mock("./components/History/AttackHistory", () => { > Open Attack 2 + {onOpenAttackAsTree && ( + + )}
); }; @@ -203,6 +219,78 @@ jest.mock("./components/History/AttackHistory", () => { }; }); +jest.mock("./components/Tree/TreeRunnerHost", () => { + const React = jest.requireActual("react") as typeof import("react"); + const dirtyTree = { + id: "dirty-tree", + rootId: "root", + nodes: [ + { id: "root", kind: "root_prompt", parentId: null, state: "clean", params: {} }, + { id: "send-1", kind: "send", parentId: "root", state: "edited", params: {} }, + ], + parentConversationTreeId: null, + }; + const hasDirty = (tree: { nodes?: Array<{ state?: string }> } | null) => + tree?.nodes?.some((node) => node.state === "edited" || node.state === "draft") === true; + + const MockTreeRunnerHost = ({ + tree, + onTreeChange, + onGuardedSwapReady, + openFromAttackResultId, + }: { + tree: typeof dirtyTree | null; + onTreeChange?: (tree: typeof dirtyTree) => void; + onGuardedSwapReady?: (guardedSwap: (tree: typeof dirtyTree | null, swap: () => void) => void) => void; + openFromAttackResultId?: string | null; + }) => { + const [pendingSwap, setPendingSwap] = React.useState<(() => void) | null>(null); + const lastOpenedArIdRef = React.useRef(null); + + const guardedSwap = React.useCallback((candidateTree: typeof dirtyTree | null, swap: () => void) => { + if (hasDirty(candidateTree)) setPendingSwap(() => swap); + else swap(); + }, []); + + React.useEffect(() => { + onGuardedSwapReady?.(guardedSwap); + }, [guardedSwap, onGuardedSwapReady]); + + React.useEffect(() => { + if (!openFromAttackResultId) return; + if (lastOpenedArIdRef.current === openFromAttackResultId) return; + lastOpenedArIdRef.current = openFromAttackResultId; + onTreeChange?.({ ...dirtyTree, id: `opened-${openFromAttackResultId}`, nodes: [dirtyTree.nodes[0]] }); + }, [openFromAttackResultId, onTreeChange]); + + return ( +
+ + {openFromAttackResultId ?? "none"} + {hasDirty(tree) ? "yes" : "no"} + {pendingSwap && ( +
+ + +
+ )} +
+ ); + }; + MockTreeRunnerHost.displayName = "MockTreeRunnerHost"; + return { TreeRunnerHost: MockTreeRunnerHost }; +}); + jest.mock("./components/Home/Home", () => { const MockHome = ({ activeTarget, @@ -239,9 +327,14 @@ jest.mock("./components/Home/Home", () => { }); describe("App", () => { + const originalTreeFlag = process.env.VITE_ENABLE_TREE_UI; + beforeEach(() => { jest.clearAllMocks(); mockGetActiveAccount.mockReturnValue(null); + if (originalTreeFlag === undefined) delete process.env.VITE_ENABLE_TREE_UI; + else process.env.VITE_ENABLE_TREE_UI = originalTreeFlag; + window.history.replaceState(window.history.state, "", "/"); }); it("renders with FluentProvider and MainLayout", () => { @@ -372,6 +465,44 @@ describe("App", () => { expect(screen.getByTestId("attack-history")).toBeInTheDocument(); }); + it("guards Open as tree from History when the current tree has unsaved edits", async () => { + process.env.VITE_ENABLE_TREE_UI = "true"; + render(); + + fireEvent.click(screen.getByTestId("nav-tree")); + fireEvent.click(screen.getByTestId("make-dirty-tree")); + expect(screen.getByTestId("tree-is-dirty")).toHaveTextContent("yes"); + + fireEvent.click(screen.getByTestId("nav-history")); + fireEvent.click(screen.getByRole("button", { name: /discard and continue/i })); + expect(screen.getByTestId("attack-history")).toBeInTheDocument(); + + fireEvent.click(screen.getByTestId("open-attack-as-tree")); + + expect(screen.getByRole("dialog")).toBeInTheDocument(); + expect(screen.getByTestId("main-layout")).toHaveAttribute("data-current-view", "history"); + fireEvent.click(screen.getByRole("button", { name: /^cancel$/i })); + expect(screen.getByTestId("main-layout")).toHaveAttribute("data-current-view", "history"); + + fireEvent.click(screen.getByTestId("open-attack-as-tree")); + fireEvent.click(screen.getByRole("button", { name: /discard and continue/i })); + + await waitFor(() => { + expect(screen.getByTestId("main-layout")).toHaveAttribute("data-current-view", "tree"); + 
}); + expect(screen.getByTestId("open-from-ar-id")).toHaveTextContent("ar-tree-1"); + }); + + it("starts in tree view when the URL carries a conversation_tree_id fragment", () => { + process.env.VITE_ENABLE_TREE_UI = "true"; + window.history.replaceState(window.history.state, "", "/#conversation_tree_id=tree-from-url"); + + render(); + + expect(screen.getByTestId("main-layout")).toHaveAttribute("data-current-view", "tree"); + expect(screen.getByTestId("tree-runner-host")).toBeInTheDocument(); + }); + it("opens attack from history and switches to chat", async () => { mockGetAttack.mockResolvedValue({ attack_result_id: "ar-attack-1", conversation_id: "attack-conv-1", labels: { operator: "roakey" } }); render(); diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index 73b05f82e2..daebc54910 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -1,20 +1,28 @@ -import { useState, useCallback, useEffect } from 'react' -import { FluentProvider, webLightTheme, webDarkTheme } from '@fluentui/react-components' +import { useState, useCallback, useEffect, useMemo, useRef } from 'react' +import { FluentProvider, webLightTheme, webDarkTheme, MessageBar, MessageBarBody } from '@fluentui/react-components' import { useMsal } from '@azure/msal-react' import MainLayout from './components/Layout/MainLayout' import ChatWindow from './components/Chat/ChatWindow' import Home from './components/Home/Home' import TargetConfig from './components/Config/TargetConfig' import AttackHistory from './components/History/AttackHistory' +import { TreeRunnerHost } from './components/Tree/TreeRunnerHost' +import type { AvailableConvertersValue } from './components/Tree/availableConvertersContext' +import { guardedNavigate } from './components/Tree/navigationGuard' +import { useDirtyEditModal } from './components/Tree/useDirtyEditModal' +import { parseTreeIdFromUrlFragment } from './runner/workspacePersistence' import { DEFAULT_HISTORY_FILTERS } from 
'./components/History/historyFilters' import type { HistoryFilters } from './components/History/historyFilters' import { ConnectionBanner } from './components/ConnectionBanner' import { ErrorBoundary } from './components/ErrorBoundary' import { ConnectionHealthProvider, useConnectionHealth } from './hooks/useConnectionHealth' import { DEFAULT_GLOBAL_LABELS } from './components/Labels/labelDefaults' +import { isTreeUiEnabled } from './featureFlags' +import { createRunWaveStarter } from './runner/runWaveStarter' +import type { ConversationTree } from './runner/treeTypes' import type { ViewName } from './components/Sidebar/Navigation' import type { TargetInstance, TargetInfo } from './types' -import { attacksApi, versionApi } from './services/api' +import { attacksApi, convertersApi, versionApi } from './services/api' const AUTO_DISMISS_MS = 5_000 @@ -42,8 +50,12 @@ function ConnectionBannerContainer() { function App() { const { instance } = useMsal() + const treeUiEnabled = isTreeUiEnabled() const [isDarkMode, setIsDarkMode] = useState(true) - const [currentView, setCurrentView] = useState('home') + const [currentView, setCurrentView] = useState(() => { + if (treeUiEnabled && parseTreeIdFromUrlFragment(window.location.hash) !== null) return 'tree' + return 'home' + }) const [activeTarget, setActiveTarget] = useState(null) const [globalLabels, setGlobalLabels] = useState>({ ...DEFAULT_GLOBAL_LABELS }) /** True while loading a historical attack from the history view */ @@ -181,6 +193,93 @@ function App() { setIsDarkMode(!isDarkMode) } + // --- Tree view (PR7i.2) — gated behind VITE_ENABLE_TREE_UI ----------------- + /** The foregrounded ConversationTree; null = greenfield. */ + const [currentTree, setCurrentTree] = useState(null) + /** + * Set when reload reconstructed a tree that had fan topology as a linear + * chain (PR7g slice-1 limitation). Surfaced as a one-line banner so the + * operator knows some structure isn't shown. Cleared on the next tree + * change. 
Removed when PR7g slice 2 lands fan-aware reload. + */ + const [treeReloadDegraded, setTreeReloadDegraded] = useState(null) + // Live operator-label mirror so the production runWaveStarter's operation + // provider re-reads the latest value without rebuilding the shim. + const globalLabelsRef = useRef(globalLabels) + useEffect(() => { + globalLabelsRef.current = globalLabels + }, [globalLabels]) + // Production wave dispatcher: bridges the shim to runWave + the real API. + // The operation closure reads globalLabelsRef lazily (per wave, in an event + // handler) — not during render — so the react-hooks/refs flag is a + // false positive here. + /* eslint-disable react-hooks/refs */ + const treeRunWaveStarter = useMemo( + () => + createRunWaveStarter({ + api: attacksApi, + operation: () => globalLabelsRef.current.operation, + }), + [], + ) + /* eslint-enable react-hooks/refs */ + + // PR7h/PR7i.3: the tree host's dirty-edit guardedSwap, captured via + // onGuardedSwapReady. Navigating away from the tree view routes through it + // so unrefreshed edits prompt a confirm (spec §13.1a). Held in a ref since + // the host fires the callback after mount. + const treeGuardedSwapRef = useRef< + ((tree: ConversationTree | null, swap: () => void) => void) | null + >(null) + const { guardedSwap: guardedOpenTreeSwap, modalElement: openTreeDirtyModalElement } = + useDirtyEditModal() + const handleNavigate = useCallback( + (target: ViewName) => { + guardedNavigate({ + currentView, + target, + tree: currentTree, + guardedSwap: treeGuardedSwapRef.current, + navigate: setCurrentView, + }) + }, + [currentView, currentTree], + ) + + /** AR id to auto-reverse into the tree view (spec §5.12 "Open as tree"). 
*/ + const [openTreeFromArId, setOpenTreeFromArId] = useState(null) + const [availableTreeConverters, setAvailableTreeConverters] = useState(null) + useEffect(() => { + if (!treeUiEnabled) return + let cancelled = false + ;(async () => { + try { + const response = await convertersApi.listConverters() + if (cancelled) return + setAvailableTreeConverters( + response.items.map((converter) => ({ + id: converter.converter_id, + label: converter.display_name ?? converter.converter_type ?? converter.converter_id, + })), + ) + } catch { + if (!cancelled) setAvailableTreeConverters(null) + } + })() + return () => { + cancelled = true + } + }, [treeUiEnabled]) + const handleOpenAttackAsTree = useCallback( + (arId: string) => { + guardedOpenTreeSwap(currentTree, () => { + setOpenTreeFromArId(arId) + setCurrentView('tree') + }) + }, + [currentTree, guardedOpenTreeSwap], + ) + return ( @@ -188,7 +287,7 @@ function App() { @@ -213,6 +312,7 @@ function App() { labels={globalLabels} onLabelsChange={setGlobalLabels} onNavigate={setCurrentView} + onOpenAttackAsTree={treeUiEnabled ? handleOpenAttackAsTree : undefined} attackLabels={attackLabels} attackTarget={attackTarget} isLoadingAttack={isLoadingAttack} @@ -228,11 +328,42 @@ function App() { {currentView === 'history' && ( )} + {treeUiEnabled && currentView === 'tree' && ( + <> + {treeReloadDegraded !== null && ( + + + This tree was reconstructed as a linear chain on reload;{' '} + {treeReloadDegraded} fan{treeReloadDegraded === 1 ? '' : 's'} from the saved + tree {treeReloadDegraded === 1 ? 'is' : 'are'} not shown. Re-run a refresh to + rebuild the full structure. 
+ + + )} + { + setTreeReloadDegraded(null) + setCurrentTree(next) + }} + onReconstructionDegraded={(info) => setTreeReloadDegraded(info.fanCount)} + onGuardedSwapReady={(guardedSwap) => { + treeGuardedSwapRef.current = guardedSwap + }} + /> + + )} + {openTreeDirtyModalElement} diff --git a/frontend/src/components/Chat/ChatWindow.tsx b/frontend/src/components/Chat/ChatWindow.tsx index 3ee4d27005..86a75a78b4 100644 --- a/frontend/src/components/Chat/ChatWindow.tsx +++ b/frontend/src/components/Chat/ChatWindow.tsx @@ -4,7 +4,7 @@ import { Text, Tooltip, } from '@fluentui/react-components' -import { AddRegular, PanelRightRegular } from '@fluentui/react-icons' +import { AddRegular, BranchForkRegular, PanelRightRegular } from '@fluentui/react-icons' import MessageList from './MessageList' import ChatInputArea from './ChatInputArea' import ConversationPanel from './ConversationPanel' @@ -32,6 +32,7 @@ interface ChatWindowProps { labels?: Record onLabelsChange?: (labels: Record) => void onNavigate?: (view: ViewName) => void + onOpenAttackAsTree?: (attackResultId: string) => void /** Labels from the loaded attack (for operator locking). Null for new attacks. */ attackLabels?: Record | null /** Target info that the current attack was started with (for cross-target guard). */ @@ -53,6 +54,7 @@ export default function ChatWindow({ labels, onLabelsChange, onNavigate, + onOpenAttackAsTree, attackLabels, attackTarget, isLoadingAttack, @@ -577,6 +579,20 @@ export default function ChatWindow({ )}
+ {onOpenAttackAsTree && ( + +
) : ( - + )} diff --git a/frontend/src/components/History/AttackTable.test.tsx b/frontend/src/components/History/AttackTable.test.tsx index 03f7ec6063..b638b8b195 100644 --- a/frontend/src/components/History/AttackTable.test.tsx +++ b/frontend/src/components/History/AttackTable.test.tsx @@ -187,6 +187,33 @@ describe('AttackTable', () => { expect(onOpenAttack).toHaveBeenCalledWith('ar-2') }) + it('does NOT render the "Open as tree" button when onOpenAttackAsTree is absent', () => { + render( + + + + ) + expect(screen.queryByTestId('open-attack-as-tree-ar-1')).toBeNull() + }) + + it('renders an "Open as tree" button when onOpenAttackAsTree is provided; clicking fires it', () => { + const onOpenAttackAsTree = jest.fn() + const onOpenAttack = jest.fn() + render( + + + + ) + fireEvent.click(screen.getByTestId('open-attack-as-tree-ar-2')) + expect(onOpenAttackAsTree).toHaveBeenCalledWith('ar-2') + // The row-level open (chat) must not also fire. + expect(onOpenAttack).not.toHaveBeenCalled() + }) + it('should render outcome badges with correct text', () => { render( diff --git a/frontend/src/components/History/AttackTable.tsx b/frontend/src/components/History/AttackTable.tsx index 906be3538b..fedfa5dc6a 100644 --- a/frontend/src/components/History/AttackTable.tsx +++ b/frontend/src/components/History/AttackTable.tsx @@ -13,6 +13,7 @@ import { } from '@fluentui/react-components' import { OpenRegular, + BranchRegular, CheckmarkCircleRegular, DismissCircleRegular, QuestionCircleRegular, @@ -38,10 +39,16 @@ const OUTCOME_COLORS: Record void + /** + * Optional "Open as tree" action (spec §5.12). Renders a second row + * button next to "Open attack" when provided; App only passes it when the + * tree-UI feature flag is on, so the button is implicitly flag-gated. 
+ */ + onOpenAttackAsTree?: (attackResultId: string) => void formatDate: (dateStr: string) => string } -export default function AttackTable({ attacks, onOpenAttack, formatDate }: AttackTableProps) { +export default function AttackTable({ attacks, onOpenAttack, onOpenAttackAsTree, formatDate }: AttackTableProps) { const styles = useAttackHistoryStyles() return ( @@ -167,18 +174,34 @@ export default function AttackTable({ attacks, onOpenAttack, formatDate }: Attac - - + + + + + + ) +} diff --git a/frontend/src/components/Tree/CostGuardrailModal.tsx b/frontend/src/components/Tree/CostGuardrailModal.tsx new file mode 100644 index 0000000000..42315bec80 --- /dev/null +++ b/frontend/src/components/Tree/CostGuardrailModal.tsx @@ -0,0 +1,110 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +/** + * Cost-guardrail confirmation modal per spec §8.1. Pure presentational + * — owns only the "Don't ask again this session" checkbox draft state; + * commit-on-Refresh, discard-on-Cancel. + */ + +import { useState } from 'react' +import { + Button, + Checkbox, + Dialog, + DialogActions, + DialogBody, + DialogContent, + DialogSurface, + DialogTitle, +} from '@fluentui/react-components' + +import type { WaveTriggerKind } from '../../runner/treeTypes' + +export interface CostGuardrailModalProps { + count: number + triggerKind: WaveTriggerKind + threshold: number + /** Resolves the runner's approve() Promise to true. */ + onRefresh: (commitSuppression: boolean) => void + /** Resolves to false; suppression draft discarded. */ + onCancel: () => void +} + +export function CostGuardrailModal({ + count, + triggerKind, + threshold, + onRefresh, + onCancel, +}: CostGuardrailModalProps) { + const [dontAskAgain, setDontAskAgain] = useState(false) + const bodyClause = bodyClauseFor(triggerKind) + return ( + { + if (!data.open) onCancel() + }} + > + + + {titleFor(triggerKind, count)} + +

+ {bodyClause} will send {count} call{count === 1 ? '' : 's'} to the target + (threshold: {threshold} call{threshold === 1 ? '' : 's'} per wave). +

+ setDontAskAgain(data.checked === true)} + label="Don't ask again this session" + /> +
+ + + + +
+
+
+ ) +} + +function titleFor(kind: WaveTriggerKind, count: number): string { + switch (kind) { + case 'refresh_tree': + return `Refresh tree (${count} call${count === 1 ? '' : 's'})?` + case 'refresh_subtree': + return `Refresh subtree (${count} call${count === 1 ? '' : 's'})?` + case 'refresh_node': + return `Refresh node (${count} call${count === 1 ? '' : 's'})?` + case 'retry_failed': + return `Retry failed (${count} call${count === 1 ? '' : 's'})?` + case 'synced_peer_add': + return `Add synced peer (${count} call${count === 1 ? '' : 's'})?` + case 'cross_tree_rebase': + return `Cross-tree rebase (${count} call${count === 1 ? '' : 's'})?` + } +} + +function bodyClauseFor(kind: WaveTriggerKind): string { + switch (kind) { + case 'refresh_tree': + return 'Refreshing the tree' + case 'refresh_subtree': + return 'Refreshing this subtree' + case 'refresh_node': + return 'Refreshing this node' + case 'retry_failed': + return 'Retrying failed nodes' + case 'synced_peer_add': + return 'Adding a synced peer' + case 'cross_tree_rebase': + return 'Performing a cross-tree rebase' + } +} diff --git a/frontend/src/components/Tree/DirtyEditModal.tsx b/frontend/src/components/Tree/DirtyEditModal.tsx new file mode 100644 index 0000000000..6e7d157387 --- /dev/null +++ b/frontend/src/components/Tree/DirtyEditModal.tsx @@ -0,0 +1,61 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +/** + * Dirty-edit confirmation modal per spec 01 §13.1a. Pure presentational + * — shown when an in-app tree swap (Switch tree / new / close) is + * attempted while the current tree carries unrefreshed edits. The + * operator either discards the edits and continues, or cancels and + * stays to Refresh first. 
+ */ + +import { + Button, + Dialog, + DialogActions, + DialogBody, + DialogContent, + DialogSurface, + DialogTitle, +} from '@fluentui/react-components' + +export interface DirtyEditModalProps { + /** Count of unrefreshed (`edited` / `draft`) nodes in the current tree. */ + count: number + /** Operator confirmed discard — run the deferred swap. */ + onDiscard: () => void + /** Operator cancelled — keep the current tree, abandon the swap. */ + onCancel: () => void +} + +export function DirtyEditModal({ count, onDiscard, onCancel }: DirtyEditModalProps) { + return ( + { + if (!data.open) onCancel() + }} + > + + + Discard unsaved edits? + +

+ You have {count} unsaved edit{count === 1 ? '' : 's'} that will be lost when + switching trees. Refresh the tree first to persist them as AttackResults, or + continue to discard. +

+
+ + + + +
+
+
+ ) +} diff --git a/frontend/src/components/Tree/FanCard.tsx b/frontend/src/components/Tree/FanCard.tsx new file mode 100644 index 0000000000..b0b6d16da2 --- /dev/null +++ b/frontend/src/components/Tree/FanCard.tsx @@ -0,0 +1,253 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +import { + Button, + Menu, + MenuItem, + MenuList, + MenuPopover, + MenuTrigger, + Tooltip, +} from '@fluentui/react-components' +import { + ArrowMaximizeRegular, + ArrowMinimizeRegular, + CheckmarkCircleFilled, + CheckmarkCircleRegular, +} from '@fluentui/react-icons' +import type { NodeProps } from '@xyflow/react' + +import type { + ConversationTreeNodeId, + FanNode, +} from '../../runner/treeTypes' +import { useActionCallbacks } from './actionCallbacksContext' +import { useStackCollapse } from './stackCollapseContext' +import type { StackAggregate, StackMember } from './fanStack' +import type { TreeFlowNode } from './conversationTreeToReactFlow' +import { CardFrame, MetaRow } from './cardFrame' +import { useNodeCardStyles } from './nodeCards.styles' + +type FanProps = NodeProps> + +export function FanCard({ data, selected }: FanProps) { + const node: FanNode = data.node + const n = node.params.variants.length + const stack = data.stackedSummary + const collapseCtx = useStackCollapse() + const callbacks = useActionCallbacks() + const onPickFanChild = callbacks?.onPickFanChild + const onPruneFanToPickedPath = callbacks?.onPruneFanToPickedPath + return ( + + + + {node.params.promotedChildSlotIndex !== null && ( + + )} + {stack !== undefined && } + {stack !== undefined && onPickFanChild !== undefined && ( + + )} + {onPruneFanToPickedPath !== undefined && ( + + )} + {collapseCtx !== null && ( + collapseCtx.toggleStack(node.id)} + /> + )} + + ) +} + +function PruneFanButton({ + fanNodeId, + variants, + promotedSlot, + onPruneFanToPickedPath, +}: { + fanNodeId: ConversationTreeNodeId + variants: number + promotedSlot: number | null + onPruneFanToPickedPath: (id: 
ConversationTreeNodeId, slotIndex: number) => void +}) { + if (promotedSlot !== null) { + return ( +
+ + + +
+ ) + } + + return ( +
+ + + + + + + + + {Array.from({ length: variants }, (_unused, slot) => ( + onPruneFanToPickedPath(fanNodeId, slot)}> + Keep slot {slot} + + ))} + + + +
+ ) +} + +/** + * Inline body shown inside the FanCard when the fan is in the collapsed + * (stacked) state. Renders the multiplicity ("Send ×10") and aggregate + * status ("9 ✓, 1 ⚠") so operators see at a glance how the stacked + * children are doing. + */ +function StackSummaryBody({ summary }: { summary: StackAggregate }) { + const styles = useNodeCardStyles() + const successful = summary.byState.clean + const running = summary.byState.running + const failed = summary.byState.failed + summary.byState.cancelled + const pending = + summary.byState.draft + + summary.byState.edited + + summary.byState.stale + const parts: string[] = [] + if (successful > 0) parts.push(`${successful} ✓`) + if (running > 0) parts.push(`${running} ●`) + if (failed > 0) parts.push(`${failed} ⚠`) + if (pending > 0) parts.push(`${pending} ⧖`) + const statusLine = parts.length > 0 ? parts.join(', ') : '—' + const kindLabel = summary.childKind ?? 'item' + return ( +
+ + {kindLabel} ×{summary.total} + + {statusLine} +
+ ) +} + +function StackToggleButton({ + collapsed, + onToggle, +}: { + collapsed: boolean + onToggle: () => void +}) { + const label = collapsed ? 'Expand stack' : 'Collapse to stack' + return ( +
+ +
+ ) +} + +/** + * Collapsed-stack Pick popover. The operator-friendly alternative to + * "expand the stack first, then click each child's Pick icon" (which + * was the four-clicks-per-decision flow the PR5f reviewer flagged as + * unusable for the dominant workflow). Lists each member as a Fluent + * MenuItem with a per-state glyph + slot number; the currently-promoted + * member shows "(picked)" and clicking it unpicks (null). + */ +function StackPickButton({ + fanNodeId, + members, + promotedSlot, + onPickFanChild, +}: { + fanNodeId: ConversationTreeNodeId + members: ReadonlyArray + promotedSlot: number | null + onPickFanChild: (id: ConversationTreeNodeId, slotIndex: number | null) => void +}) { + return ( +
+ + + + + + + + + {members.map((m) => { + const isPromoted = promotedSlot !== null && promotedSlot === m.slotIndex + const next = isPromoted ? null : m.slotIndex + const label = `slot ${m.slotIndex} (${m.state})${isPromoted ? ' ✓ (picked)' : ''}` + return ( + : } + onClick={() => onPickFanChild(fanNodeId, next)} + > + {label} + + ) + })} + + + +
+ ) +} diff --git a/frontend/src/components/Tree/ImportMessageCard.tsx b/frontend/src/components/Tree/ImportMessageCard.tsx new file mode 100644 index 0000000000..fd204cb3c7 --- /dev/null +++ b/frontend/src/components/Tree/ImportMessageCard.tsx @@ -0,0 +1,28 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +import type { NodeProps } from '@xyflow/react' + +import type { ImportMessageNode } from '../../runner/treeTypes' +import type { TreeFlowNode } from './conversationTreeToReactFlow' +import { CardFrame, MetaRow } from './cardFrame' + +type ImportMessageProps = NodeProps> + +export function ImportMessageCard({ data, selected }: ImportMessageProps) { + const node: ImportMessageNode = data.node + return ( + + + + + ) +} diff --git a/frontend/src/components/Tree/InsertEdge.tsx b/frontend/src/components/Tree/InsertEdge.tsx new file mode 100644 index 0000000000..1aa84c054b --- /dev/null +++ b/frontend/src/components/Tree/InsertEdge.tsx @@ -0,0 +1,275 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +/** + * Per-edge insert chip + popover. Custom react-flow edge component that + * extends the smoothstep path with a `+` button at the midpoint; + * clicking the chip opens a kind-aware Fluent Menu of insert options. + * + * Chip visibility is gated on the host having supplied an + * `onEdgeInsert` callback (via ActionCallbacksContext) AND the parent + * being a kind that admits any legal insert (Score and Fan parents + * suppress the chip — see PARENTS_WITHOUT_INSERT below). 
+ */ + +import { useMemo, useState } from 'react' +import { + BaseEdge, + EdgeLabelRenderer, + getSmoothStepPath, + useStore, +} from '@xyflow/react' +import type { EdgeProps } from '@xyflow/react' +import { + Button, + Menu, + MenuItem, + MenuList, + MenuPopover, + MenuTrigger, + Tooltip, +} from '@fluentui/react-components' +import { AddRegular } from '@fluentui/react-icons' + +import { useActionCallbacks } from './actionCallbacksContext' +import type { EdgeInsertKind } from './actionRail' +import type { TreeFlowEdge } from './conversationTreeToReactFlow' +import { useInsertEdgeStyles } from './insertEdge.styles' +import type { ConversationTreeNodeId, ConversationTreeNodeKind } from '../../runner/treeTypes' + +// Parents whose edges do NOT show the chip. Score is terminal (no +// post-Score insert in V1.0); Fan children are managed via the FanCard's +// own `+` (add variant) button, not via the edge below the Fan. +const PARENTS_WITHOUT_INSERT: ReadonlySet = new Set([ + 'score', + 'fan', +]) + +/** + * Discriminated union: disabled items deliberately have no `kind` + * field. A V1.1-disabled item that would otherwise be tempted to + * mint a placeholder `kind` (then silently dispatch the wrong axis + * once the flag flips) cannot — the type system rejects it. Enabling + * the item requires also picking a real `kind` at the same change. + */ +export type InsertMenuOption = + | { + readonly disabled: false + readonly kind: EdgeInsertKind + readonly label: string + } + | { + readonly disabled: true + readonly label: string + readonly disabledReason: string + } + +interface InsertMenu { + basic: InsertMenuOption[] + fanAxes: ReadonlyArray // submenu items +} + +/** + * Per-parent menu. The legal next-node types depend on the upstream + * node's kind — surfacing only the legal options is cheaper than + * showing all + erroring on commit. 
+ */ +function menuForParent(parentKind: ConversationTreeNodeKind): InsertMenu | null { + switch (parentKind) { + case 'root_prompt': + return { + basic: [ + { disabled: false, kind: 'follow_up_user_turn', label: 'Follow-up user message' }, + { disabled: false, kind: 'inject_assistant_text', label: 'Inject assistant text' }, + { disabled: false, kind: 'send', label: 'Add response' }, + ], + fanAxes: V1_0_FAN_AXES, + } + case 'import_message': + return { + basic: [ + { disabled: false, kind: 'follow_up_user_turn', label: 'Follow-up user message' }, + { disabled: false, kind: 'inject_assistant_text', label: 'Inject assistant text' }, + { disabled: false, kind: 'send', label: 'Add response' }, + ], + fanAxes: V1_0_FAN_AXES, + } + case 'user_turn': + return { + basic: [ + { disabled: false, kind: 'send', label: 'Add response' }, + { disabled: false, kind: 'append_converter', label: 'Append converter' }, + ], + fanAxes: [ + { disabled: false, kind: 'fan_converter', label: 'Fan out: converter' }, + ], + } + case 'converter': + return { + basic: [ + { disabled: false, kind: 'send', label: 'Add response' }, + ], + fanAxes: [], + } + case 'send': + return { + basic: [ + { disabled: false, kind: 'follow_up_user_turn', label: 'Follow-up user message' }, + { disabled: false, kind: 'inject_assistant_text', label: 'Inject assistant text' }, + { disabled: false, kind: 'score', label: 'Score' }, + ], + fanAxes: V1_0_FAN_AXES, + } + case 'score': + case 'fan': + return null + } +} + +const V1_0_FAN_AXES: ReadonlyArray = [ + { disabled: false, kind: 'fan_attempt', label: 'Fan out: attempt' }, + { disabled: false, kind: 'fan_converter', label: 'Fan out: converter' }, +] + +export function InsertEdge({ + id, + source, + target, + sourceX, + sourceY, + sourcePosition, + targetX, + targetY, + targetPosition, + data, + style, + markerEnd, +}: EdgeProps) { + const callbacks = useActionCallbacks() + const styles = useInsertEdgeStyles() + const [open, setOpen] = useState(false) + // 
EdgeLabelRenderer portals into the `.react-flow__edgelabel-renderer` + // div, which exists only inside the full `ReactFlow` render tree (NOT + // inside a bare ReactFlowProvider). When testing the edge directly (no + // `ReactFlow` mounted), the portal target is absent and the chip falls + // back to rendering inline. Production always has the portal target; + // the visual is the same either way. + const hasPortalTarget = useStore((s) => Boolean(s.domNode?.querySelector('.react-flow__edgelabel-renderer'))) + + const parentKind = data?.parentKind + const menu = useMemo( + () => (parentKind !== undefined ? menuForParent(parentKind) : null), + [parentKind], + ) + + const [edgePath, labelX, labelY] = getSmoothStepPath({ + sourceX, + sourceY, + sourcePosition, + targetX, + targetY, + targetPosition, + }) + + const showChip = + callbacks?.onEdgeInsert !== undefined && + parentKind !== undefined && + !PARENTS_WITHOUT_INSERT.has(parentKind) && + menu !== null + + if (!showChip) { + return + } + + const onEdgeInsert = callbacks!.onEdgeInsert! + const handleSelect = (kind: EdgeInsertKind) => { + // react-flow's EdgeProps types source/target as plain strings; brand + // them back to ConversationTreeNodeId at the callback boundary so + // hosts get the same type the runner uses everywhere else. + onEdgeInsert( + source as ConversationTreeNodeId, + target as ConversationTreeNodeId, + kind, + ) + setOpen(false) + } + + const chip = ( +
+ setOpen(d.open)} positioning="below"> + + + +
+ ) + + return ( + <> + + {hasPortalTarget ? {chip} : chip} + + ) +} + +function parentLabel(kind: ConversationTreeNodeKind): string { + switch (kind) { + case 'root_prompt': + return 'root prompt' + case 'import_message': + return 'imported message' + case 'user_turn': + return 'user turn' + case 'converter': + return 'converter' + case 'send': + return 'response' + case 'fan': + return 'fan' + case 'score': + return 'score' + } +} diff --git a/frontend/src/components/Tree/PastRunsDrawer.styles.ts b/frontend/src/components/Tree/PastRunsDrawer.styles.ts new file mode 100644 index 0000000000..3a09f76e44 --- /dev/null +++ b/frontend/src/components/Tree/PastRunsDrawer.styles.ts @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +import { makeStyles, tokens } from '@fluentui/react-components' + +export const usePastRunsDrawerStyles = makeStyles({ + drawer: { + display: 'flex', + flexDirection: 'column', + gap: tokens.spacingVerticalXS, + padding: tokens.spacingVerticalS, + }, + empty: { + color: tokens.colorNeutralForeground3, + fontStyle: 'italic', + margin: 0, + }, + entry: { + display: 'flex', + alignItems: 'center', + gap: tokens.spacingHorizontalS, + padding: `${tokens.spacingVerticalXS} ${tokens.spacingHorizontalS}`, + border: `1px solid ${tokens.colorNeutralStroke2}`, + borderRadius: tokens.borderRadiusSmall, + backgroundColor: tokens.colorNeutralBackground1, + fontSize: tokens.fontSizeBase200, + '&[data-current="true"]': { + borderLeftWidth: '3px', + borderLeftColor: tokens.colorBrandStroke1, + }, + '&[data-pinned="true"]': { + backgroundColor: tokens.colorNeutralBackground2, + }, + }, + outcome: { + fontFamily: tokens.fontFamilyMonospace, + }, + id: { + fontFamily: tokens.fontFamilyMonospace, + color: tokens.colorNeutralForeground2, + }, + timestamp: { + color: tokens.colorNeutralForeground3, + fontSize: tokens.fontSizeBase100, + }, + wave: { + fontFamily: tokens.fontFamilyMonospace, + color: 
tokens.colorNeutralForeground3, + fontSize: tokens.fontSizeBase100, + }, + currentTag: { + color: tokens.colorBrandForeground1, + fontWeight: tokens.fontWeightSemibold, + fontSize: tokens.fontSizeBase100, + textTransform: 'uppercase', + }, + actions: { + marginLeft: 'auto', + display: 'flex', + gap: tokens.spacingHorizontalXXS, + }, +}) diff --git a/frontend/src/components/Tree/PastRunsDrawer.test.tsx b/frontend/src/components/Tree/PastRunsDrawer.test.tsx new file mode 100644 index 0000000000..e1e438c000 --- /dev/null +++ b/frontend/src/components/Tree/PastRunsDrawer.test.tsx @@ -0,0 +1,318 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +/** + * Tests for `PastRunsDrawer` — the per-node "Past runs" drawer tab + * per spec §2.3 (right-side drawer slides in when a node is selected). + * + * Renders the node's current `execution` (if any) + reverse-chrono + * `executionHistory[]` reflog entries. Each entry shows attempt + * timestamp, outcome, waveId suffix, and per-entry actions: + * - pin / unpin (operator-meaningful preservation; runner's + * `setReflogPinned` is the wire) + * - checkout (swap this past run back into the `execution` slot; + * PR6e ships the button + callback contract; the host's + * `makeCurrent` plumbing is V1.x) + */ + +import { fireEvent, render, screen, within } from '@testing-library/react' + +import { PastRunsDrawer } from './PastRunsDrawer' +import type { + ConversationTreeNodeId, + ExecutionRecord, + ReflogEntry, +} from '../../runner/treeTypes' +import { nodeId } from '../../runner/testHelpers' + +// ============================================================================ +// Helpers +// ============================================================================ + +function mkExec( + id: string, + attemptedAt: string, + outcome: ExecutionRecord['outcome'] = 'success', +): ExecutionRecord { + return { + executionId: id, + attemptedAt, + attackResultId: `ar-${id}`, + conversationId: `c-${id}`, + 
pieceIds: [], + outcome, + resolvedInputHashAtExecution: 'h', + waveId: `wave-${id}`, + waveTriggerKind: 'refresh_tree', + dispatchedAt: attemptedAt, + targetFirstByteAt: null, + completedAt: attemptedAt, + } +} + +function mkEntry(id: string, attemptedAt: string, pinned = false): ReflogEntry { + return { execution: mkExec(id, attemptedAt), pinned } +} + +const N: ConversationTreeNodeId = nodeId('node-1') + +// ============================================================================ +// Empty / no execution +// ============================================================================ + +describe('PastRunsDrawer — empty state', () => { + it('renders an empty-state hint when execution is null AND history is empty', () => { + const { container } = render( + , + ) + expect((container.textContent ?? '').toLowerCase()).toMatch(/no past runs|no executions|nothing here/i) + }) + + it('renders nothing actionable when no callbacks are wired', () => { + const { container } = render( + , + ) + // No pin/unpin/checkout buttons when no callbacks supplied. + expect(screen.queryByRole('button', { name: /pin|unpin|checkout/i })).toBeNull() + // Current execution still rendered. + expect((container.textContent ?? '')).toMatch(/e1/) + }) +}) + +// ============================================================================ +// Current + history rendering +// ============================================================================ + +describe('PastRunsDrawer — render entries', () => { + it('renders the current execution as the topmost entry, marked current', () => { + const current = mkExec('e_current', '2026-06-11T12:00:00Z') + const history: ReflogEntry[] = [ + mkEntry('e_older', '2026-06-11T11:00:00Z'), + mkEntry('e_oldest', '2026-06-11T10:00:00Z'), + ] + const { container } = render( + , + ) + const entries = container.querySelectorAll('[data-tree-reflog-entry]') + expect(entries).toHaveLength(3) + // First (newest) entry is the current execution. 
+ expect(entries[0].getAttribute('data-execution-id')).toBe('e_current') + expect(entries[0].getAttribute('data-current')).toBe('true') + // Subsequent entries are historical. + expect(entries[1].getAttribute('data-execution-id')).toBe('e_older') + expect(entries[1].getAttribute('data-current')).toBe('false') + }) + + it('renders history reverse-chronologically as supplied (host owns ordering)', () => { + // PastRunsDrawer doesn't sort — it trusts the host's ordering so a + // host-side filter (e.g., pinned-first) can shape the list. Test + // pins the contract: the iteration order matches input order. + const history: ReflogEntry[] = [ + mkEntry('e_a', '2026-06-11T11:00:00Z'), + mkEntry('e_b', '2026-06-11T09:00:00Z'), + mkEntry('e_c', '2026-06-11T10:00:00Z'), + ] + const { container } = render( + , + ) + const entries = container.querySelectorAll('[data-tree-reflog-entry]') + expect(entries).toHaveLength(3) + expect([ + entries[0].getAttribute('data-execution-id'), + entries[1].getAttribute('data-execution-id'), + entries[2].getAttribute('data-execution-id'), + ]).toEqual(['e_a', 'e_b', 'e_c']) + }) + + it('shows entry outcome glyph (✓ for success, ⚠ for failure)', () => { + const history: ReflogEntry[] = [ + { execution: mkExec('e_pass', '2026-06-11T11:00:00Z', 'success'), pinned: false }, + { execution: mkExec('e_fail', '2026-06-11T10:00:00Z', 'failure'), pinned: false }, + ] + const { container } = render( + , + ) + const text = container.textContent ?? '' + expect(text).toMatch(/\u2713/) // ✓ + expect(text).toMatch(/\u26A0/) // ⚠ + }) + + it('shows the ⦾ glyph for cancelled outcome', () => { + const history: ReflogEntry[] = [ + { execution: mkExec('e_cancel', '2026-06-11T11:00:00Z', 'cancelled'), pinned: false }, + ] + const { container } = render( + , + ) + expect(container.textContent ?? 
'').toMatch(/\u29BE/) // ⦾ + }) + + it('shows the ● glyph for pending outcome', () => { + const history: ReflogEntry[] = [ + { execution: mkExec('e_pend', '2026-06-11T11:00:00Z', 'pending'), pinned: false }, + ] + const { container } = render( + , + ) + expect(container.textContent ?? '').toMatch(/\u25CF/) // ● + }) +}) + +// ============================================================================ +// Pin / unpin +// ============================================================================ + +describe('PastRunsDrawer — pin/unpin', () => { + it('renders a Pin button on an unpinned entry; clicking fires onTogglePin(executionId, true)', () => { + const onTogglePin = jest.fn() + const history: ReflogEntry[] = [mkEntry('e_x', '2026-06-11T10:00:00Z', false)] + const { container } = render( + , + ) + const entry = container.querySelector('[data-execution-id="e_x"]') as HTMLElement + fireEvent.click(within(entry).getByRole('button', { name: /^pin$/i })) + expect(onTogglePin).toHaveBeenCalledWith('e_x', true) + }) + + it('renders an Unpin button on a pinned entry; clicking fires onTogglePin(executionId, false)', () => { + const onTogglePin = jest.fn() + const history: ReflogEntry[] = [mkEntry('e_x', '2026-06-11T10:00:00Z', true)] + const { container } = render( + , + ) + const entry = container.querySelector('[data-execution-id="e_x"]') as HTMLElement + fireEvent.click(within(entry).getByRole('button', { name: /^unpin$/i })) + expect(onTogglePin).toHaveBeenCalledWith('e_x', false) + }) + + it('current execution does NOT show a pin button (current is implicitly preserved)', () => { + const current = mkExec('e_current', '2026-06-11T12:00:00Z') + const { container } = render( + , + ) + const entry = container.querySelector('[data-execution-id="e_current"]') as HTMLElement + expect(within(entry).queryByRole('button', { name: /pin|unpin/i })).toBeNull() + }) +}) + +// ============================================================================ +// Checkout +// 
============================================================================ + +describe('PastRunsDrawer — checkout', () => { + it('renders a Checkout button on past entries; clicking fires onCheckout(executionId)', () => { + const onCheckout = jest.fn() + const history: ReflogEntry[] = [mkEntry('e_old', '2026-06-11T10:00:00Z')] + const { container } = render( + , + ) + const entry = container.querySelector('[data-execution-id="e_old"]') as HTMLElement + fireEvent.click(within(entry).getByRole('button', { name: /checkout/i })) + expect(onCheckout).toHaveBeenCalledWith('e_old') + }) + + it('current execution does NOT show a Checkout button (you cannot checkout to yourself)', () => { + const current = mkExec('e_current', '2026-06-11T12:00:00Z') + const { container } = render( + , + ) + const entry = container.querySelector('[data-execution-id="e_current"]') as HTMLElement + expect(within(entry).queryByRole('button', { name: /checkout/i })).toBeNull() + }) +}) + +// ============================================================================ +// Pin marker affordance — pinned entries are visually distinguished +// ============================================================================ + +describe('PastRunsDrawer — pin marker', () => { + it('pinned entries carry data-pinned="true" for downstream styling', () => { + const history: ReflogEntry[] = [ + mkEntry('e_pin', '2026-06-11T11:00:00Z', true), + mkEntry('e_unpin', '2026-06-11T10:00:00Z', false), + ] + const { container } = render( + , + ) + expect( + container.querySelector('[data-execution-id="e_pin"]')?.getAttribute('data-pinned'), + ).toBe('true') + expect( + container.querySelector('[data-execution-id="e_unpin"]')?.getAttribute('data-pinned'), + ).toBe('false') + }) +}) + +// ============================================================================ +// Long executionId rendering — truncate visually, full id on title (PR6.5) +// ============================================================================ 
+ +describe('PastRunsDrawer — UUID truncation', () => { + const LONG_UUID = '7f48f2d2-3c3f-4cf8-aae5-1234567890ab' + + it('renders only a short prefix of a long executionId in the visible text', () => { + const history: ReflogEntry[] = [mkEntry(LONG_UUID, '2026-06-11T10:00:00Z')] + const { container } = render( + , + ) + const entry = container.querySelector( + `[data-execution-id="${LONG_UUID}"]`, + ) as HTMLElement + // The visible id span should NOT contain the full UUID string. + const idSpan = entry.querySelector('[data-tree-execution-id-display]') as HTMLElement + expect(idSpan).not.toBeNull() + expect(idSpan.textContent ?? '').not.toContain(LONG_UUID) + // It should contain the leading 8-hex prefix so the operator can + // visually cross-reference logs. + expect(idSpan.textContent ?? '').toContain('7f48f2d2') + }) + + it('exposes the full executionId on the id span title attribute for hover lookup', () => { + const history: ReflogEntry[] = [mkEntry(LONG_UUID, '2026-06-11T10:00:00Z')] + const { container } = render( + , + ) + const idSpan = container.querySelector( + '[data-tree-execution-id-display]', + ) as HTMLElement + expect(idSpan.getAttribute('title')).toBe(LONG_UUID) + }) + + it('renders short executionIds (≤12 chars) unchanged — no ellipsis added', () => { + const history: ReflogEntry[] = [mkEntry('e_short', '2026-06-11T10:00:00Z')] + const { container } = render( + , + ) + const idSpan = container.querySelector( + '[data-tree-execution-id-display]', + ) as HTMLElement + expect(idSpan.textContent).toBe('e_short') + // Full id still on title for consistency. + expect(idSpan.getAttribute('title')).toBe('e_short') + }) +}) diff --git a/frontend/src/components/Tree/PastRunsDrawer.tsx b/frontend/src/components/Tree/PastRunsDrawer.tsx new file mode 100644 index 0000000000..ccaee784c4 --- /dev/null +++ b/frontend/src/components/Tree/PastRunsDrawer.tsx @@ -0,0 +1,164 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +/** + * Per-node "Past runs" drawer tab per spec §2.3. Pure presentational — + * the host owns the selected-node state and supplies the node's + * current `execution` + `executionHistory[]`. + * + * Per spec §6.6: `executionHistory` is the reflog. Entries are + * push-down LIFO; pinned entries survive cap-eviction. The drawer + * renders the current execution first (marked `data-current="true"`), + * then the history in caller-supplied order (host owns ordering so + * pinned-first / wave-grouped variants can land in V1.x without a + * component change). + */ + +import { Button } from '@fluentui/react-components' + +import type { + ConversationTreeNodeId, + ExecutionRecord, + ReflogEntry, +} from '../../runner/treeTypes' +import { usePastRunsDrawerStyles } from './PastRunsDrawer.styles' + +export interface PastRunsDrawerProps { + /** Which node this drawer is showing. Used for callback context only — the + * host already knows which node it's wiring; no display use. */ + nodeId: ConversationTreeNodeId + execution: ExecutionRecord | null + executionHistory: ReadonlyArray + /** + * Toggle pinned state for a past-run entry. The runner sink's + * `setReflogPinned` is the wire; host passes `(executionId, + * !currentlyPinned)`. + */ + onTogglePin?: (executionId: string, pinned: boolean) => void + /** + * Swap a past run back into the current execution slot. PR6e ships + * the contract; the host's `makeCurrent` plumbing (or runner sink + * extension) lands in V1.x per spec §6.7. + */ + onCheckout?: (executionId: string) => void +} + +export function PastRunsDrawer({ + nodeId: _nodeId, + execution, + executionHistory, + onTogglePin, + onCheckout, +}: PastRunsDrawerProps) { + const styles = usePastRunsDrawerStyles() + if (execution === null && executionHistory.length === 0) { + return ( +
+

No past runs.

+
+ ) + } + return ( +
+ {execution !== null && ( + + )} + {executionHistory.map((entry) => ( + + ))} +
+ ) +} + +function EntryRow({ + execution, + isCurrent, + pinned, + onTogglePin, + onCheckout, +}: { + execution: ExecutionRecord + isCurrent: boolean + pinned: boolean + onTogglePin?: (executionId: string, pinned: boolean) => void + onCheckout?: (executionId: string) => void +}) { + const styles = usePastRunsDrawerStyles() + const outcomeGlyph = outcomeGlyphFor(execution.outcome) + const waveSuffix = (execution.waveId ?? '').slice(0, 6) + return ( +
+ + {outcomeGlyph} + + + {truncateId(execution.executionId)} + + {execution.attemptedAt} + {waveSuffix !== '' && wave: {waveSuffix}} + {isCurrent && current} +
+ {!isCurrent && onTogglePin !== undefined && ( + + )} + {!isCurrent && onCheckout !== undefined && ( + + )} +
+
+ ) +} + +function truncateId(id: string): string { + if (id.length <= 12) return id + return `${id.slice(0, 8)}\u2026` +} + +function outcomeGlyphFor(outcome: ExecutionRecord['outcome']): string { + switch (outcome) { + case 'success': + return '✓' + case 'failure': + case 'error': + return '⚠' + case 'cancelled': + return '⦾' + case 'pending': + return '●' + } +} diff --git a/frontend/src/components/Tree/RootPromptCard.tsx b/frontend/src/components/Tree/RootPromptCard.tsx new file mode 100644 index 0000000000..825b12be45 --- /dev/null +++ b/frontend/src/components/Tree/RootPromptCard.tsx @@ -0,0 +1,144 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +import { + Button, + Input, + Textarea, + Tooltip, +} from '@fluentui/react-components' +import { EditRegular } from '@fluentui/react-icons' +import type { NodeProps } from '@xyflow/react' +import { useState } from 'react' + +import type { RootPromptNode } from '../../runner/treeTypes' +import { useActionCallbacks } from './actionCallbacksContext' +import { useEditorKeyboard } from './useEditorKeyboard' +import type { TreeFlowNode } from './conversationTreeToReactFlow' +import { CardBody, CardFrame, MetaRow } from './cardFrame' +import { useNodeCardStyles } from './nodeCards.styles' + +type RootPromptProps = NodeProps> + +export function RootPromptCard({ data, selected }: RootPromptProps) { + const node: RootPromptNode = data.node + const styles = useNodeCardStyles() + const callbacks = useActionCallbacks() + const onEditParams = callbacks?.onEditRootPromptParams + const [isEditing, setIsEditing] = useState(false) + const kindActions = + onEditParams !== undefined && !isEditing ? ( + +