From 8d981607862139ab1eb7a4b31123615292588126 Mon Sep 17 00:00:00 2001 From: Louise Lau Date: Fri, 26 Jun 2026 09:16:43 +0800 Subject: [PATCH] =?UTF-8?q?feat(skills):=20strengthen=20UCB1=20=E2=80=94?= =?UTF-8?q?=20composite=20reward,=20tunable=20c,=20trial=20floor,=20scores?= =?UTF-8?q?,=20off-switch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All five requested upgrades to the Phase-4 bandit, in the pure tested core: 1. Tunable exploration factor — RouteOptions.explorationFactor (config learning.versioning.ucbExplorationFactor; default √2). Higher c keeps a barely-tested challenger in play longer. 2. Composite reward — UCB1's per-arm value is no longer raw success rate but a weighted blend of success + token-efficiency + time-efficiency (default 0.7/0.15/0.15, configurable). Success dominates (a cheap failure never beats an expensive success), but between two equally-successful versions the cheaper + faster one wins. Added totalDurationMs to VersionStats (back-compat optional); recordVersionExecution now takes durationMs; decideChampion converges on the composite reward too. 3. Minimum challenger trials — RouteOptions.minChallengerTrials (default 5): a challenger is force-routed until it clears the floor, so UCB1 never starves it and no routing decision is made on too little signal. 4. UCB score snapshot/analysis — ucbScores(manifest) returns reward + exploration bonus + ucb per arm; `qodex skill versions <name>` now prints the breakdown (and avg ms/run) for debugging. 5. Off-switch — routingStrategy 'champion-only' (config strategy) disables UCB and always routes the stable version — for sensitive skills you don't want experimented on. config: learning.versioning { ucbExplorationFactor, minChallengerTrials, rewardWeights, strategy }, threaded through versioned-store + the CLI. Live-verified: between two 100%-success versions, the cheaper/faster one won on composite reward and was promoted to champion. 
Tests: +6 (champion-only, trial floor, exploration-factor explore/exploit, composite reward incl. success- dominates, ucbScores snapshot) + the existing 12. typecheck + full suite (1222) + build green. --- src/cli/skill-command.ts | 17 +++- src/config/defaults.ts | 14 +++ src/skills/learning/skill-versioning.ts | 119 ++++++++++++++++++------ src/skills/learning/versioned-store.ts | 21 +++-- test/skill-versioning-ucb.test.ts | 76 +++++++++++++++ test/skill-versioning.test.ts | 6 +- 6 files changed, 214 insertions(+), 39 deletions(-) create mode 100644 test/skill-versioning-ucb.test.ts diff --git a/src/cli/skill-command.ts b/src/cli/skill-command.ts index 486ea27..fce88f5 100644 --- a/src/cli/skill-command.ts +++ b/src/cli/skill-command.ts @@ -196,18 +196,29 @@ export function buildSkillCommand(): Command { .action(async (name: string) => { const { loadSkillByName } = await import('../skills/loader.js'); const { readManifest } = await import('../skills/learning/versioned-store.js'); - const { routeSkillVersion } = await import('../skills/learning/skill-versioning.js'); + const { routeSkillVersion, ucbScores } = await import('../skills/learning/skill-versioning.js'); + const { loadConfig } = await import('../config/loader.js'); const spec = await loadSkillByName(name, process.cwd()); if (!spec) { console.error(`✗ no skill named "${name}"`); process.exit(1); } const m = await readManifest(spec.dir); if (!m) { console.log(`"${name}" is a single-version (legacy) skill — no version history yet.`); return; } - const routed = routeSkillVersion(m); + const vcfg = (await loadConfig(process.cwd()) as any).learning?.versioning ?? {}; + const opts = { + explorationFactor: vcfg.ucbExplorationFactor, + minChallengerTrials: vcfg.minChallengerTrials, + weights: vcfg.rewardWeights ? { success: vcfg.rewardWeights.success ?? 0.7, token: vcfg.rewardWeights.token ?? 0.15, time: vcfg.rewardWeights.time ?? 
0.15 } : undefined, + }; + const routed = routeSkillVersion(m, opts); + const scores = new Map(ucbScores(m, opts).map(s => [s.version, s])); console.log(`Skill "${m.skillId}" · strategy: ${m.routingStrategy} · routed this turn → ${routed}\n`); for (const v of Object.values(m.versions).sort((a, b) => a.version.localeCompare(b.version, undefined, { numeric: true }))) { const tag = v.version === m.activeVersion ? '★ champion' : v.version === m.challengerVersion ? '⚡ challenger' : v.retired ? '✗ retired' : ''; const rate = v.stats.executions ? `${Math.round((v.stats.successes / v.stats.executions) * 100)}% over ${v.stats.executions}` : 'untested'; + const avgMs = v.stats.executions && v.stats.totalDurationMs ? ` · ${Math.round(v.stats.totalDurationMs / v.stats.executions)}ms/run` : ''; console.log(` ${v.version} [${v.author}] ${tag}`); - console.log(` success: ${rate} · tokens: ${v.stats.totalTokensUsed} · confidence: ${v.confidence}`); + console.log(` success: ${rate} · tokens: ${v.stats.totalTokensUsed}${avgMs} · confidence: ${v.confidence}`); + const s = scores.get(v.version); + if (s) console.log(` UCB: reward ${s.reward.toFixed(3)} + bonus ${s.bonus === Infinity ? '∞' : s.bonus.toFixed(3)} = ${s.ucb === Infinity ? '∞' : s.ucb.toFixed(3)}`); } }); diff --git a/src/config/defaults.ts b/src/config/defaults.ts index 15ef6a0..a713504 100644 --- a/src/config/defaults.ts +++ b/src/config/defaults.ts @@ -439,6 +439,20 @@ export interface QodexConfig { * judge is unsure (grey-zone average or high cross-dimension variance). Must differ from * defaults.model and judgeModel. Unset ⇒ no escalation (Tier-1 verdict stands). */ judgeModelTier2?: string; + /** Skill versioning + UCB1 adaptive-bandit routing knobs. */ + versioning?: { + /** UCB1 exploration factor `c` — higher explores challengers more. Default √2 (~1.41). 
*/ + ucbExplorationFactor?: number; + /** Force-route a challenger at least this many times before UCB1 can starve it, so a + * decision is never made on too little signal. Default 5. */ + minChallengerTrials?: number; + /** Composite-reward weights (success + token-efficiency + time-efficiency). Defaults + * { success: 0.7, token: 0.15, time: 0.15 }. */ + rewardWeights?: { success?: number; token?: number; time?: number }; + /** Routing strategy when a manifest doesn't pin one: 'ucb1' (default), 'static', or + * 'champion-only' (UCB OFF — always the stable version, for sensitive skills). */ + strategy?: 'ucb1' | 'static' | 'champion-only'; + }; /** When auto-promoting, require at least this confidence (0–100). Default 0 (the * judge's pass is sufficient); raise it to gate low-confidence captures. */ autoPromoteMinConfidence?: number; diff --git a/src/skills/learning/skill-versioning.ts b/src/skills/learning/skill-versioning.ts index 1b76175..1840de1 100644 --- a/src/skills/learning/skill-versioning.ts +++ b/src/skills/learning/skill-versioning.ts @@ -22,6 +22,29 @@ export interface VersionStats { executions: number; successes: number; totalTokensUsed: number; + /** Total wall-clock across executions (ms) — fuels the time term of the composite reward. + * Optional for backward-compat with manifests written before reward weighting. */ + totalDurationMs?: number; +} + +export interface RewardWeights { + /** Weight on the success rate (dominant). */ + success: number; + /** Weight on token efficiency (cheaper = better). */ + token: number; + /** Weight on time efficiency (faster = better). */ + time: number; +} +export const DEFAULT_WEIGHTS: RewardWeights = { success: 0.7, token: 0.15, time: 0.15 }; + +export interface RouteOptions { + /** UCB1 exploration factor `c` (higher = more exploration). Default √2. 
*/ + explorationFactor?: number; + /** Force-route a challenger until it has at least this many trials, BEFORE UCB1 can + * starve it — so a decision is never made on too little signal. Default 5. */ + minChallengerTrials?: number; + /** Composite-reward weights. */ + weights?: RewardWeights; } export interface VersionDetail { @@ -40,13 +63,15 @@ export interface SkillManifest { skillId: string; activeVersion: string; // the stable champion challengerVersion?: string; // the version under test - routingStrategy: 'static' | 'ucb1'; + /** 'ucb1' (adaptive bandit), 'static' (~25% challenger), or 'champion-only' (UCB OFF — + * always the stable version; for sensitive skills you don't want experimented on). */ + routingStrategy: 'static' | 'ucb1' | 'champion-only'; versions: Record; } /** A fresh manifest for a brand-new skill (its first version is v1, the champion). */ export function initManifest(skillId: string, author: 'human' | 'machine', confidence = 50, nowIso = ''): { manifest: SkillManifest; fileName: string } { - const v: VersionDetail = { version: 'v1', createdAt: nowIso, author, confidence, stats: { executions: 0, successes: 0, totalTokensUsed: 0 } }; + const v: VersionDetail = { version: 'v1', createdAt: nowIso, author, confidence, stats: { executions: 0, successes: 0, totalTokensUsed: 0, totalDurationMs: 0 } }; return { manifest: { skillId, activeVersion: 'v1', routingStrategy: 'ucb1', versions: { v1: v } }, fileName: 'SKILL.v1.md', @@ -92,31 +117,72 @@ export function createNextVersion( }; } -/** UCB1 score for a version given the total trials N. */ -function ucb1(v: VersionDetail, N: number, c: number): number { - if (v.stats.executions === 0) return Infinity; // always try an unsampled arm first - const mean = v.stats.successes / v.stats.executions; - return mean + c * Math.sqrt(Math.log(Math.max(1, N)) / v.stats.executions); +const clamp01 = (x: number) => Math.max(0, Math.min(1, x)); +const perExec = (total: number, exec: number) => (exec ? 
total / exec : 0); + +interface RewardNorm { maxTokensPerExec: number; maxMsPerExec: number } +function rewardNorm(arms: VersionDetail[]): RewardNorm { + let maxTokensPerExec = 0, maxMsPerExec = 0; + for (const v of arms) { + maxTokensPerExec = Math.max(maxTokensPerExec, perExec(v.stats.totalTokensUsed, v.stats.executions)); + maxMsPerExec = Math.max(maxMsPerExec, perExec(v.stats.totalDurationMs ?? 0, v.stats.executions)); + } + return { maxTokensPerExec, maxMsPerExec }; +} + +/** + * COMPOSITE reward in [0,1]: success rate dominates, with token- and time-EFFICIENCY + * nudges (cheaper / faster relative to the other arm scores higher). Efficiency terms are + * neutral (0.5) when there's no scale to normalize against. PURE. + */ +export function compositeReward(v: VersionDetail, norm: RewardNorm, weights: RewardWeights = DEFAULT_WEIGHTS): number { + if (v.stats.executions === 0) return 0; + const successRate = v.stats.successes / v.stats.executions; + const tokScore = norm.maxTokensPerExec > 0 ? 1 - perExec(v.stats.totalTokensUsed, v.stats.executions) / norm.maxTokensPerExec : 0.5; + const timeScore = norm.maxMsPerExec > 0 ? 1 - perExec(v.stats.totalDurationMs ?? 0, v.stats.executions) / norm.maxMsPerExec : 0.5; + const w = weights, denom = w.success + w.token + w.time || 1; + return (w.success * successRate + w.token * clamp01(tokScore) + w.time * clamp01(timeScore)) / denom; +} + +export interface UcbScore { version: string; reward: number; bonus: number; ucb: number; executions: number } + +/** Per-arm UCB breakdown (reward + exploration bonus) for the active+challenger — a pure + * snapshot for debugging / `qodex skill versions` / analysis. */ +export function ucbScores(manifest: SkillManifest, opts: RouteOptions = {}): UcbScore[] { + const c = opts.explorationFactor ?? 
Math.sqrt(2); + const arms = [manifest.activeVersion, manifest.challengerVersion] + .filter((x): x is string => !!x).map(v => manifest.versions[v]).filter((v): v is VersionDetail => !!v); + const norm = rewardNorm(arms); + const N = arms.reduce((s, v) => s + v.stats.executions, 0); + return arms.map(v => { + const reward = compositeReward(v, norm, opts.weights); + const bonus = v.stats.executions === 0 ? Infinity : c * Math.sqrt(Math.log(Math.max(1, N)) / v.stats.executions); + return { version: v.version, reward, bonus, ucb: reward + bonus, executions: v.stats.executions }; + }); } /** - * Choose which version to inject this turn. No challenger → the champion. UCB1 balances - * exploring the challenger against exploiting the better arm; an unsampled version is - * tried first (Infinity), and a challenger whose success rate collapses loses traffic. + * Choose which version to inject this turn: + * - no challenger / champion-only strategy → the champion (UCB OFF; sensitive skills), + * - static → ~25% challenger, + * - ucb1 → force-explore the challenger until it clears `minChallengerTrials` (so we never + * judge on too little signal), then pick the higher UCB (composite reward + bonus); + * ties go to the champion (don't disturb the stable version). */ -export function routeSkillVersion(manifest: SkillManifest, c: number = Math.sqrt(2)): string { +export function routeSkillVersion(manifest: SkillManifest, opts: RouteOptions = {}): string { const champ = manifest.activeVersion; const chal = manifest.challengerVersion; if (!chal || !manifest.versions[chal] || chal === champ) return champ; + if (manifest.routingStrategy === 'champion-only') return champ; + if (manifest.routingStrategy === 'static') return deterministicStatic(manifest) ? chal : champ; - if (manifest.routingStrategy === 'static') { - return deterministicStatic(manifest) ? 
chal : champ; - } - const v1 = manifest.versions[champ]!; - const v2 = manifest.versions[chal]!; - const N = v1.stats.executions + v2.stats.executions; - // Tie → champion (conservative: don't disturb the stable version on a tie). - return ucb1(v2, N, c) > ucb1(v1, N, c) ? chal : champ; + const minTrials = opts.minChallengerTrials ?? 5; + if (manifest.versions[chal]!.stats.executions < minTrials) return chal; // exploration floor + + const scores = ucbScores(manifest, opts); + const champU = scores.find(s => s.version === champ)?.ucb ?? -Infinity; + const chalU = scores.find(s => s.version === chal)?.ucb ?? -Infinity; + return chalU > champU ? chal : champ; } /** Deterministic ~25% challenger pick for the 'static' strategy (no Math.random — keeps the @@ -131,7 +197,7 @@ function deterministicStatic(manifest: SkillManifest): boolean { export function recordVersionExecution( manifest: SkillManifest, version: string, - outcome: { success: boolean; tokens?: number }, + outcome: { success: boolean; tokens?: number; durationMs?: number }, ): SkillManifest { const v = manifest.versions[version]; if (!v) return manifest; @@ -141,6 +207,7 @@ export function recordVersionExecution( executions: v.stats.executions + 1, successes: v.stats.successes + (outcome.success ? 1 : 0), totalTokensUsed: v.stats.totalTokensUsed + (outcome.tokens ?? 0), + totalDurationMs: (v.stats.totalDurationMs ?? 0) + (outcome.durationMs ?? 0), }, }; return { ...manifest, versions: { ...manifest.versions, [version]: updated } }; @@ -152,16 +219,15 @@ export interface ChampionDecision { reason: string; } -const rate = (v: VersionDetail) => (v.stats.executions ? v.stats.successes / v.stats.executions : 0); - /** - * Converge the A/B test. Once the challenger has enough samples: + * Converge the A/B test on the COMPOSITE reward (success + token + time), once the + * challenger has enough samples: * - clearly BETTER than the champion (by `margin`) → PROMOTE it to active. 
- * - clearly WORSE → RETIRE it (drop the challenger, keep the champion). + * - clearly WORSE → RETIRE it (kept in history, marked retired). * - otherwise keep testing. Below `minExecutions` we never decide (too little signal). * PURE. */ -export function decideChampion(manifest: SkillManifest, opts: { minExecutions?: number; margin?: number } = {}): ChampionDecision { +export function decideChampion(manifest: SkillManifest, opts: { minExecutions?: number; margin?: number; weights?: RewardWeights } = {}): ChampionDecision { const minExec = opts.minExecutions ?? 8; const margin = opts.margin ?? 0.1; const chal = manifest.challengerVersion; @@ -171,7 +237,8 @@ export function decideChampion(manifest: SkillManifest, opts: { minExecutions?: const chalV = manifest.versions[chal]!; if (chalV.stats.executions < minExec) return { manifest, action: 'keep-testing', reason: `challenger has ${chalV.stats.executions}/${minExec} executions` }; - const cr = rate(chalV), pr = rate(champV); + const norm = rewardNorm([champV, chalV]); + const cr = compositeReward(chalV, norm, opts.weights), pr = compositeReward(champV, norm, opts.weights); if (cr >= pr + margin) { const m: SkillManifest = { ...manifest, activeVersion: chal, challengerVersion: undefined }; return { manifest: m, action: 'promote', reason: `challenger ${(cr * 100).toFixed(0)}% beat champion ${(pr * 100).toFixed(0)}% by ≥${margin * 100}%` }; diff --git a/src/skills/learning/versioned-store.ts b/src/skills/learning/versioned-store.ts index a1b5cbd..d2cde6d 100644 --- a/src/skills/learning/versioned-store.ts +++ b/src/skills/learning/versioned-store.ts @@ -12,7 +12,7 @@ import { promises as fs } from 'fs'; import * as path from 'path'; import { - type SkillManifest, type ChampionDecision, + type SkillManifest, type ChampionDecision, type RouteOptions, initManifest, createNextVersion, routeSkillVersion, recordVersionExecution, decideChampion, versionFileName, } from './skill-versioning.js'; @@ -29,11 +29,12 @@ async 
function writeManifest(skillDir: string, m: SkillManifest): Promise await fs.rename(tmp, manifestPath(skillDir)); // atomic } -/** The skill body to inject this turn + which version it is. Falls back to legacy SKILL.md. */ -export async function routedSkillBody(skillDir: string): Promise<{ version: string; body: string } | null> { +/** The skill body to inject this turn + which version it is. Falls back to legacy SKILL.md. + * `opts` carries the configurable UCB knobs (exploration factor, min trials, weights). */ +export async function routedSkillBody(skillDir: string, opts: RouteOptions = {}): Promise<{ version: string; body: string } | null> { const m = await readManifest(skillDir); if (m) { - const version = routeSkillVersion(m); + const version = routeSkillVersion(m, opts); try { return { version, body: await fs.readFile(path.join(skillDir, versionFileName(version)), 'utf-8') }; } catch { /* fall through to legacy */ } } @@ -64,12 +65,18 @@ export async function addChallenger(skillDir: string, skillId: string, body: str return updatedManifest; } -/** Record one execution outcome for the routed version, then try to converge the A/B test. */ -export async function recordOutcomeAndConverge(skillDir: string, version: string, outcome: { success: boolean; tokens?: number }): Promise { +/** Record one execution outcome (success + tokens + duration) for the routed version, then + * try to converge the A/B test on the composite reward. 
*/ +export async function recordOutcomeAndConverge( + skillDir: string, + version: string, + outcome: { success: boolean; tokens?: number; durationMs?: number }, + opts: { minExecutions?: number; margin?: number } = {}, +): Promise { const m = await readManifest(skillDir); if (!m) return null; const afterStats = recordVersionExecution(m, version, outcome); - const decision = decideChampion(afterStats); + const decision = decideChampion(afterStats, opts); await writeManifest(skillDir, decision.manifest); return decision; } diff --git a/test/skill-versioning-ucb.test.ts b/test/skill-versioning-ucb.test.ts new file mode 100644 index 0000000..e6f57ce --- /dev/null +++ b/test/skill-versioning-ucb.test.ts @@ -0,0 +1,76 @@ +import { describe, it, expect } from 'vitest'; +import { + initManifest, createNextVersion, recordVersionExecution, routeSkillVersion, decideChampion, + compositeReward, ucbScores, type SkillManifest, type VersionDetail, +} from '../src/skills/learning/skill-versioning.js'; + +const challenger = () => createNextVersion(initManifest('s', 'machine', 50, '').manifest, 'machine').updatedManifest; +function feed(m: SkillManifest, v: string, runs: Array<{ success: boolean; tokens?: number; durationMs?: number }>): SkillManifest { + for (const r of runs) m = recordVersionExecution(m, v, r); + return m; +} +const ten = (success: boolean, tokens = 0, durationMs = 0) => Array(10).fill(0).map(() => ({ success, tokens, durationMs })); + +describe('#5 champion-only — UCB OFF for sensitive skills', () => { + it('always routes the champion regardless of a strong challenger', () => { + let m = challenger(); + m = { ...m, routingStrategy: 'champion-only' }; + m = feed(m, 'v2', ten(true)); // perfect challenger + expect(routeSkillVersion(m)).toBe(m.activeVersion); + }); +}); + +describe('#3 minChallengerTrials — force exploration before serious judgment', () => { + it('routes the challenger until it clears the floor, even if it looks bad early', () => { + let m = 
challenger(); + m = feed(m, 'v1', ten(true)); // champion strong + m = feed(m, 'v2', [{ success: false }, { success: false }]); // 2 bad trials + expect(routeSkillVersion(m, { minChallengerTrials: 5 })).toBe('v2'); // still forced + m = feed(m, 'v2', [{ success: false }, { success: false }, { success: false }]); // now 5 + expect(routeSkillVersion(m, { minChallengerTrials: 5 })).toBe(m.activeVersion); // UCB takes over → champion + }); +}); + +describe('#1 ucbExplorationFactor — tunes explore vs exploit', () => { + it('a higher c keeps a barely-tested challenger in play longer', () => { + let m = challenger(); + m = feed(m, 'v1', ten(true)); // champion 100% over 10 + m = feed(m, 'v2', [...ten(true).slice(0, 5), { success: false }]); // challenger ~83% over 6 + // tiny c → exploit champion; large c → explore challenger + expect(routeSkillVersion(m, { explorationFactor: 0.1, minChallengerTrials: 5 })).toBe(m.activeVersion); + expect(routeSkillVersion(m, { explorationFactor: 5, minChallengerTrials: 5 })).toBe('v2'); + }); +}); + +describe('#2 composite reward — success + token + time efficiency', () => { + it('between two EQUALLY-successful versions, the cheaper + faster one scores higher', () => { + let m = challenger(); + m = feed(m, 'v1', ten(true, 1000, 2000)); // champion: 100% but expensive/slow + m = feed(m, 'v2', ten(true, 200, 400)); // challenger: 100% but cheap/fast + const champReward = compositeReward(m.versions.v1!, { maxTokensPerExec: 1000, maxMsPerExec: 2000 }); + const chalReward = compositeReward(m.versions.v2!, { maxTokensPerExec: 1000, maxMsPerExec: 2000 }); + expect(chalReward).toBeGreaterThan(champReward); + // and decideChampion promotes the more efficient one + expect(decideChampion(m, { minExecutions: 8 }).action).toBe('promote'); + }); + it('success still dominates — a cheap FAILURE never beats an expensive success', () => { + const norm = { maxTokensPerExec: 1000, maxMsPerExec: 2000 }; + const good: VersionDetail = { version: 'a', 
createdAt: '', author: 'machine', confidence: 50, stats: { executions: 10, successes: 9, totalTokensUsed: 10000, totalDurationMs: 20000 } }; + const cheapFail: VersionDetail = { version: 'b', createdAt: '', author: 'machine', confidence: 50, stats: { executions: 10, successes: 2, totalTokensUsed: 100, totalDurationMs: 100 } }; + expect(compositeReward(good, norm)).toBeGreaterThan(compositeReward(cheapFail, norm)); + }); +}); + +describe('#4 ucbScores — debugging/analysis snapshot', () => { + it('exposes reward + bonus + ucb per arm', () => { + let m = challenger(); + m = feed(m, 'v1', ten(true)); + m = feed(m, 'v2', ten(false)); + const scores = ucbScores(m); + expect(scores.map(s => s.version).sort()).toEqual(['v1', 'v2']); + const v1 = scores.find(s => s.version === 'v1')!; + expect(v1.reward).toBeGreaterThan(0); + expect(v1.ucb).toBeCloseTo(v1.reward + v1.bonus, 5); + expect(v1.executions).toBe(10); + }); +}); diff --git a/test/skill-versioning.test.ts b/test/skill-versioning.test.ts index 69d807a..6813568 100644 --- a/test/skill-versioning.test.ts +++ b/test/skill-versioning.test.ts @@ -63,9 +63,9 @@ describe('routeSkillVersion — UCB1 explore/exploit', () => { describe('recordVersionExecution — pure stat update', () => { it('increments executions/successes/tokens', () => { let m = initManifest('s', 'machine', 50, '').manifest; - m = recordVersionExecution(m, 'v1', { success: true, tokens: 100 }); - m = recordVersionExecution(m, 'v1', { success: false, tokens: 50 }); - expect(m.versions.v1!.stats).toEqual({ executions: 2, successes: 1, totalTokensUsed: 150 }); + m = recordVersionExecution(m, 'v1', { success: true, tokens: 100, durationMs: 1000 }); + m = recordVersionExecution(m, 'v1', { success: false, tokens: 50, durationMs: 500 }); + expect(m.versions.v1!.stats).toEqual({ executions: 2, successes: 1, totalTokensUsed: 150, totalDurationMs: 1500 }); }); it('unknown version is a no-op', () => { const m = initManifest('s', 'machine', 50, '').manifest;