diff --git a/README.md b/README.md index 825f8c9..2ffe58d 100644 --- a/README.md +++ b/README.md @@ -133,6 +133,35 @@ qodex skill snapshots # rollback points; qodex skill restore Every successful task can also be exported as a **ShareGPT JSONL** corpus (`flywheel.datasetExport: true` → `~/.qodex/dataset/`) — a ready-to-use dataset for a future zero-cost local fine-tune. Strictly local; nothing is uploaded. +### Skill versioning & A/B testing (UCB1) + +A skill keeps its whole history in **one flat directory** — `manifest.json` + `SKILL.v1.md`, `SKILL.v2.md`, … — no symlinks, identical on every OS. When a new candidate is captured for an existing skill it becomes a **challenger** to the stable **champion**, and QodeX routes traffic between them with the **UCB1 adaptive bandit** instead of a fixed split: it explores the challenger enough to get signal, then favours whichever has the higher score — and a challenger that turns out worse has its traffic driven to **zero automatically**. + +The score is a **composite reward**, not just win-rate: *success* dominates, but *token-* and *time-efficiency* (normalized **relative to the champion**) break ties — so between two equally-correct versions, the **cheaper, faster** one wins. 
+ +```yaml +learning: + versioning: + strategy: ucb1 # or 'champion-only' to freeze a sensitive skill (UCB off) + ucbExplorationFactor: 1.41 # √2 — higher explores challengers more + minChallengerTrials: 5 # force a challenger ≥5 runs before judging it + rewardWeights: { success: 0.7, token: 0.15, time: 0.15 } +``` + +```text +$ qodex skill versions git-commit-expert +Skill "git-commit-expert" · strategy: ucb1 · routed this turn → v2 + + v1 [human] ★ champion + success: 88% over 40 · tokens: 60000 · 1900ms/run · confidence: 75 + UCB: reward 0.838 + bonus 0.214 = 1.052 + v2 [machine] ⚡ challenger + success: 92% over 12 · tokens: 41000 · 1300ms/run · confidence: 60 + UCB: reward 0.921 + bonus 0.391 = 1.312 ← higher → gets this turn + +$ qodex skill rollback git-commit-expert v1 # snap the champion back to v1 anytime +``` + ## Install **Prerequisites:** **Node 20+** (Node 22 LTS recommended) and **Git**. `dist/` is built locally (not committed), so the `npm run build` step is **required** on every platform. The build links two commands — `qodex` and the short alias `qx`. diff --git a/src/cli/skill-command.ts b/src/cli/skill-command.ts index fce88f5..d42fb47 100644 --- a/src/cli/skill-command.ts +++ b/src/cli/skill-command.ts @@ -222,6 +222,25 @@ export function buildSkillCommand(): Command { } }); + cmd + .command('rollback <name> <version>') + .description('Roll a versioned skill\'s champion back to an earlier version (e.g. v1) — drops any challenger') + .action(async (name: string, version: string) => { + const { loadSkillByName } = await import('../skills/loader.js'); + const { rollbackToVersion } = await import('../skills/learning/versioned-store.js'); + const spec = await loadSkillByName(name, process.cwd()); + if (!spec) { console.error(`✗ no skill named "${name}"`); process.exit(1); } + const ver = version.startsWith('v') ? 
version : `v${version}`; + const ok = await rollbackToVersion(spec.dir, ver); + if (ok) { + console.log(`✓ "${name}" rolled back — champion is now ${ver}.`); + await refreshSkillRegistry(); + } else { + console.error(`✗ "${name}" has no version ${ver} (or isn't versioned). Run \`qodex skill versions ${name}\`.`); + process.exit(1); + } + }); + cmd .command('lessons') .description('Show "learned cautions" mined from your RECURRING tool failures (failure-driven learning)') diff --git a/src/skills/learning/skill-versioning.ts b/src/skills/learning/skill-versioning.ts index 1840de1..2a3ac25 100644 --- a/src/skills/learning/skill-versioning.ts +++ b/src/skills/learning/skill-versioning.ts @@ -120,28 +120,36 @@ export function createNextVersion( const clamp01 = (x: number) => Math.max(0, Math.min(1, x)); const perExec = (total: number, exec: number) => (exec ? total / exec : 0); -interface RewardNorm { maxTokensPerExec: number; maxMsPerExec: number } -function rewardNorm(arms: VersionDetail[]): RewardNorm { - let maxTokensPerExec = 0, maxMsPerExec = 0; - for (const v of arms) { - maxTokensPerExec = Math.max(maxTokensPerExec, perExec(v.stats.totalTokensUsed, v.stats.executions)); - maxMsPerExec = Math.max(maxMsPerExec, perExec(v.stats.totalDurationMs ?? 0, v.stats.executions)); - } - return { maxTokensPerExec, maxMsPerExec }; +/** The CHAMPION's per-execution cost/latency — the reference everything is normalized + * against, so efficiency means "vs the stable version". */ +export interface RewardRef { champTokensPerExec: number; champMsPerExec: number } +export function championRef(champion: VersionDetail): RewardRef { + return { + champTokensPerExec: perExec(champion.stats.totalTokensUsed, champion.stats.executions), + champMsPerExec: perExec(champion.stats.totalDurationMs ?? 0, champion.stats.executions), + }; +} + +/** Efficiency in [0,1] normalized against the champion: at champion cost → 0.5 (baseline), + * free → 1.0, twice the champion's cost → 0.0. 
Neutral (0.5) when the champion has no scale. */ +function efficiency(vPerExec: number, champPerExec: number): number { + if (champPerExec <= 0) return 0.5; + return clamp01(1 - 0.5 * (vPerExec / champPerExec)); } /** - * COMPOSITE reward in [0,1]: success rate dominates, with token- and time-EFFICIENCY - * nudges (cheaper / faster relative to the other arm scores higher). Efficiency terms are - * neutral (0.5) when there's no scale to normalize against. PURE. + * COMPOSITE reward in [0,1]: success rate dominates, with token- and time-EFFICIENCY nudges + * measured RELATIVE TO THE CHAMPION (the stable version is the baseline a challenger must + * beat). A version cheaper/faster than the champion scores above the 0.5 efficiency + * baseline; one twice as costly scores 0. PURE. */ -export function compositeReward(v: VersionDetail, norm: RewardNorm, weights: RewardWeights = DEFAULT_WEIGHTS): number { +export function compositeReward(v: VersionDetail, ref: RewardRef, weights: RewardWeights = DEFAULT_WEIGHTS): number { if (v.stats.executions === 0) return 0; const successRate = v.stats.successes / v.stats.executions; - const tokScore = norm.maxTokensPerExec > 0 ? 1 - perExec(v.stats.totalTokensUsed, v.stats.executions) / norm.maxTokensPerExec : 0.5; - const timeScore = norm.maxMsPerExec > 0 ? 1 - perExec(v.stats.totalDurationMs ?? 0, v.stats.executions) / norm.maxMsPerExec : 0.5; + const tokScore = efficiency(perExec(v.stats.totalTokensUsed, v.stats.executions), ref.champTokensPerExec); + const timeScore = efficiency(perExec(v.stats.totalDurationMs ?? 
0, v.stats.executions), ref.champMsPerExec); const w = weights, denom = w.success + w.token + w.time || 1; - return (w.success * successRate + w.token * clamp01(tokScore) + w.time * clamp01(timeScore)) / denom; + return (w.success * successRate + w.token * tokScore + w.time * timeScore) / denom; } export interface UcbScore { version: string; reward: number; bonus: number; ucb: number; executions: number } @@ -152,10 +160,10 @@ export function ucbScores(manifest: SkillManifest, opts: RouteOptions = {}): Ucb const c = opts.explorationFactor ?? Math.sqrt(2); const arms = [manifest.activeVersion, manifest.challengerVersion] .filter((x): x is string => !!x).map(v => manifest.versions[v]).filter((v): v is VersionDetail => !!v); - const norm = rewardNorm(arms); + const champ = manifest.versions[manifest.activeVersion]; const ref = champ ? championRef(champ) : { champTokensPerExec: 0, champMsPerExec: 0 }; /* champion missing from manifest → zero scale, so efficiency stays neutral (0.5) instead of throwing */ const N = arms.reduce((s, v) => s + v.stats.executions, 0); return arms.map(v => { - const reward = compositeReward(v, norm, opts.weights); + const reward = compositeReward(v, ref, opts.weights); const bonus = v.stats.executions === 0 ? 
Infinity : c * Math.sqrt(Math.log(Math.max(1, N)) / v.stats.executions); return { version: v.version, reward, bonus, ucb: reward + bonus, executions: v.stats.executions }; }); @@ -237,8 +245,8 @@ export function decideChampion(manifest: SkillManifest, opts: { minExecutions?: const chalV = manifest.versions[chal]!; if (chalV.stats.executions < minExec) return { manifest, action: 'keep-testing', reason: `challenger has ${chalV.stats.executions}/${minExec} executions` }; - const norm = rewardNorm([champV, chalV]); - const cr = compositeReward(chalV, norm, opts.weights), pr = compositeReward(champV, norm, opts.weights); + const ref = championRef(champV); + const cr = compositeReward(chalV, ref, opts.weights), pr = compositeReward(champV, ref, opts.weights); if (cr >= pr + margin) { const m: SkillManifest = { ...manifest, activeVersion: chal, challengerVersion: undefined }; return { manifest: m, action: 'promote', reason: `challenger ${(cr * 100).toFixed(0)}% beat champion ${(pr * 100).toFixed(0)}% by ≥${margin * 100}%` }; diff --git a/src/skills/learning/versioned-store.ts b/src/skills/learning/versioned-store.ts index d2cde6d..1d62349 100644 --- a/src/skills/learning/versioned-store.ts +++ b/src/skills/learning/versioned-store.ts @@ -65,6 +65,22 @@ export async function addChallenger(skillDir: string, skillId: string, body: str return updatedManifest; } +/** Roll a skill's champion back to an earlier version: set it active, drop any challenger, + * and un-retire it if it had lost a past A/B. Returns false if the version doesn't exist. */ +export async function rollbackToVersion(skillDir: string, version: string): Promise<boolean> { + const m = await readManifest(skillDir); + if (!m || !m.versions[version]) return false; + const v = { ...m.versions[version]!, retired: false }; + const updated: SkillManifest = { + ...m, + activeVersion: version, + challengerVersion: undefined, 
+ versions: { ...m.versions, [version]: v }, + }; + await writeManifest(skillDir, updated); + return true; +} + /** Record one execution outcome (success + tokens + duration) for the routed version, then * try to converge the A/B test on the composite reward. */ export async function recordOutcomeAndConverge( diff --git a/test/skill-versioning-ucb.test.ts b/test/skill-versioning-ucb.test.ts index e6f57ce..35c0280 100644 --- a/test/skill-versioning-ucb.test.ts +++ b/test/skill-versioning-ucb.test.ts @@ -1,7 +1,7 @@ import { describe, it, expect } from 'vitest'; import { initManifest, createNextVersion, recordVersionExecution, routeSkillVersion, decideChampion, - compositeReward, ucbScores, type SkillManifest, type VersionDetail, + compositeReward, championRef, ucbScores, type SkillManifest, type VersionDetail, } from '../src/skills/learning/skill-versioning.js'; const challenger = () => createNextVersion(initManifest('s', 'machine', 50, '').manifest, 'machine').updatedManifest; @@ -42,22 +42,28 @@ describe('#1 ucbExplorationFactor — tunes explore vs exploit', () => { }); }); -describe('#2 composite reward — success + token + time efficiency', () => { - it('between two EQUALLY-successful versions, the cheaper + faster one scores higher', () => { +describe('#2 composite reward — efficiency normalized RELATIVE TO THE CHAMPION', () => { + it('between two EQUALLY-successful versions, the cheaper + faster challenger beats the champion', () => { let m = challenger(); m = feed(m, 'v1', ten(true, 1000, 2000)); // champion: 100% but expensive/slow m = feed(m, 'v2', ten(true, 200, 400)); // challenger: 100% but cheap/fast - const champReward = compositeReward(m.versions.v1!, { maxTokensPerExec: 1000, maxMsPerExec: 2000 }); - const chalReward = compositeReward(m.versions.v2!, { maxTokensPerExec: 1000, maxMsPerExec: 2000 }); - expect(chalReward).toBeGreaterThan(champReward); - // and decideChampion promotes the more efficient one - 
expect(decideChampion(m, { minExecutions: 8 }).action).toBe('promote'); + const ref = championRef(m.versions.v1!); // normalize vs the champion + expect(compositeReward(m.versions.v1!, ref)).toBeCloseTo(0.85, 2); // champion → 0.5 efficiency baseline + expect(compositeReward(m.versions.v2!, ref)).toBeGreaterThan(compositeReward(m.versions.v1!, ref)); + expect(decideChampion(m, { minExecutions: 8 }).action).toBe('promote'); // cheaper wins + }); + it('a challenger TWICE the champion cost is penalized (efficiency → 0)', () => { + let m = challenger(); + m = feed(m, 'v1', ten(true, 500, 1000)); + m = feed(m, 'v2', ten(true, 1000, 2000)); // 2× the champion's cost, same success + const ref = championRef(m.versions.v1!); + expect(compositeReward(m.versions.v2!, ref)).toBeLessThan(compositeReward(m.versions.v1!, ref)); }); it('success still dominates — a cheap FAILURE never beats an expensive success', () => { - const norm = { maxTokensPerExec: 1000, maxMsPerExec: 2000 }; const good: VersionDetail = { version: 'a', createdAt: '', author: 'machine', confidence: 50, stats: { executions: 10, successes: 9, totalTokensUsed: 10000, totalDurationMs: 20000 } }; const cheapFail: VersionDetail = { version: 'b', createdAt: '', author: 'machine', confidence: 50, stats: { executions: 10, successes: 2, totalTokensUsed: 100, totalDurationMs: 100 } }; - expect(compositeReward(good, norm)).toBeGreaterThan(compositeReward(cheapFail, norm)); + const ref = championRef(good); // champion is the good one + expect(compositeReward(good, ref)).toBeGreaterThan(compositeReward(cheapFail, ref)); }); });