Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/config/defaults.ts
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,10 @@ export interface QodexConfig {
/** Explicit model id for the independent judge. Must differ from defaults.model
* (self-grade is rejected). Falls back to the 'reflection' routing role when unset. */
judgeModel?: string;
/** Tier-2 (heavy/cloud) judge for the escalating cascade — used ONLY when the Tier-1
* judge is unsure (grey-zone average or high cross-dimension variance). Must differ from
* defaults.model and judgeModel. Unset ⇒ no escalation (Tier-1 verdict stands). */
judgeModelTier2?: string;
/** When auto-promoting, require at least this confidence (0–100). Default 0 (the
* judge's pass is sufficient); raise it to gate low-confidence captures. */
autoPromoteMinConfidence?: number;
Expand Down
62 changes: 53 additions & 9 deletions src/skills/learning/curator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,28 @@ import { ModelRouter } from '../../llm/router.js';
import { logger } from '../../utils/logger.js';
import { loadSkillByName } from '../loader.js';
import { listCandidates, readCandidate, promoteCandidate, writeCandidate, archiveCandidate } from './candidate-store.js';
import { buildJudgePrompt, parseJudgeVerdict, buildMergePrompt, parseMergeResult } from './judge.js';
import { buildMergePrompt, parseMergeResult } from './judge.js';
import {
buildRubricPrompt, parseRubricScores, shouldEscalate, rubricToVerdict,
alignmentDrift, buildCalibrationBlock, type DriftRecord, type RubricScores,
} from './judge-cascade.js';
import { decidePromotion } from './promotion.js';
import { snapshotSkills } from './snapshot.js';
import { findSimilarPairs, skillSimilarityText } from './similarity.js';
import { recordLearningEvent } from './ledger.js';
import { promises as fsp } from 'fs';
import * as os from 'os';
import * as nodePath from 'path';

/** Location of the judge-drift JSONL log: `<home>/.qodex/judge-drift.jsonl`. */
const driftPath = (): string => {
  const qodexDir = nodePath.join(os.homedir(), '.qodex');
  return nodePath.join(qodexDir, 'judge-drift.jsonl');
};
/**
 * Load the drift records stored in the JSONL log at `driftPath()`.
 *
 * Best-effort: a missing or unreadable file yields `[]`. Each line is parsed on its own so
 * that one corrupt line (for example a truncated append) is skipped instead of discarding
 * the whole history. Entries that parse but lack an object `tier1`/`tier2` or a finite
 * numeric `drift` are dropped, so consumers can read those fields without re-validating.
 *
 * @returns The well-formed records, in file order.
 */
async function readDrift(): Promise<DriftRecord[]> {
  let raw: string;
  try {
    raw = await fsp.readFile(driftPath(), 'utf-8');
  } catch {
    return []; // no log yet (or unreadable) — nothing to calibrate from
  }

  const records: DriftRecord[] = [];
  for (const line of raw.split('\n')) {
    const trimmed = line.trim(); // also tolerates CRLF line endings
    if (!trimmed) continue;
    try {
      const parsed = JSON.parse(trimmed) as Partial<DriftRecord> | null;
      const wellFormed =
        parsed !== null && typeof parsed === 'object' &&
        typeof parsed.drift === 'number' && Number.isFinite(parsed.drift) &&
        parsed.tier1 != null && typeof parsed.tier1 === 'object' &&
        parsed.tier2 != null && typeof parsed.tier2 === 'object';
      if (wellFormed) records.push(parsed as DriftRecord);
    } catch {
      // malformed line — skip it and keep the rest of the history
    }
  }
  return records;
}
/**
 * Append one drift record as a single JSON line to the log at `driftPath()`, creating the
 * parent directory first if needed. Best-effort: any failure is swallowed.
 */
async function appendDrift(r: DriftRecord): Promise<void> {
  try {
    const logFile = driftPath();
    await fsp.mkdir(nodePath.dirname(logFile), { recursive: true });
    const jsonLine = JSON.stringify(r) + '\n';
    await fsp.appendFile(logFile, jsonLine, 'utf-8');
  } catch {
    /* best-effort */
  }
}

export interface CurateResult {
snapshot: string | null;
Expand Down Expand Up @@ -122,6 +139,24 @@ export async function curateCandidates(
// Re-list after the merge pass so promotion sees the collapsed set.
const liveCandidates = (await listCandidates()).filter(c => !mergedAway.has(c.name));
const existingNames = liveCandidates.map(c => c.name);

// ── Escalating cascade setup ── Tier 1 is judgeRoute (the light/local judge). Tier 2 is an
// optional heavy model (learning.judgeModelTier2) we escalate to only when Tier 1 is unsure.
const tier2Model = String((config as any).learning?.judgeModelTier2 ?? '').trim();
let tier2Route: ReturnType<typeof router.route> | null = null;
if (tier2Model && tier2Model !== authorModel && tier2Model !== judgeRoute.model) {
try { tier2Route = router.route('reflection', 2000, { explicitModel: tier2Model }); } catch { tier2Route = null; }
}
// Calibration: if Tier 1 has been drifting from Tier 2, feed it the worst past corrections.
const driftRecords = await readDrift();
const calibration = buildCalibrationBlock(driftRecords, 3);

// Ask one judge route to score a candidate on the rubric (temperature 0). Resolves to the
// parsed scores, or null when the reply cannot be parsed.
const scoreWith = async (route: NonNullable<typeof tier2Route>, md: string, calib: string): Promise<RubricScores | null> => {
  const prompt = buildRubricPrompt(md, calib);
  const request = {
    model: route.model,
    messages: [{ role: 'system', content: prompt.system }, { role: 'user', content: prompt.user }],
    temperature: 0,
  } as any;
  const reply = await drainText(route.provider.complete(request));
  return parseRubricScores(reply);
};

for (const c of liveCandidates) {
const md = await readCandidate(c.name);
if (!md) { result.skipped.push({ name: c.name, reason: 'unreadable' }); continue; }
Expand All @@ -132,16 +167,25 @@ export async function curateCandidates(
continue;
}

log(`Judging "${c.name}" with ${judgeRoute.model} …`);
// ── Escalating cascade: Tier 1 (light) scores; escalate to Tier 2 (heavy) only when unsure ──
let verdict;
try {
const { system, user } = buildJudgePrompt(md, existingNames.filter(n => n !== c.name));
const text = await drainText(judgeRoute.provider.complete({
model: judgeRoute.model,
messages: [{ role: 'system', content: system }, { role: 'user', content: user }],
temperature: 0,
} as any));
verdict = parseJudgeVerdict(text, judgeRoute.model);
log(`Judging "${c.name}" with ${judgeRoute.model} (Tier 1) …`);
const t1 = await scoreWith(judgeRoute, md, calibration);
let finalScores = t1, finalModel = judgeRoute.model, reasons: string[] = [];
// Escalate when Tier 1 couldn't score (null) or is in the grey/confused zone, and a Tier 2 exists.
if (tier2Route && (!t1 || shouldEscalate(t1))) {
log(` ↑ escalating "${c.name}" to ${tier2Route.model} (Tier 2)${t1 ? ` — avg/variance unclear` : ' — Tier 1 unparseable'} …`);
const t2 = await scoreWith(tier2Route, md, '');
if (t2) {
finalScores = t2; finalModel = tier2Route.model; reasons = [t2.justification].filter(Boolean);
if (t1) await appendDrift({ ts: new Date().toISOString(), tier1: t1, tier2: t2, drift: alignmentDrift(t1, t2) });
}
} else if (t1) {
reasons = [t1.justification].filter(Boolean);
}
if (!finalScores) { result.skipped.push({ name: c.name, reason: 'judge produced no parseable scores (Tier 1 + Tier 2)' }); continue; }
verdict = { pass: rubricToVerdict(finalScores), judgeModel: finalModel, reasons };
} catch (e: any) {
result.skipped.push({ name: c.name, reason: `judge call failed: ${e?.message}` });
continue;
Expand Down
130 changes: 130 additions & 0 deletions src/skills/learning/judge-cascade.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/**
* Escalating cascade judge (Phase 5).
*
* Ensembling several heavy models per decision would crawl on local hardware. Instead we
* cascade: a fast LOCAL model (Tier 1) scores every candidate on a fixed rubric, and we
* escalate to a heavy CLOUD model (Tier 2) ONLY when Tier 1 is genuinely unsure —
* detected from the scores themselves:
*
* - Twilight zone: the average lands in the grey middle (5.5–7.5).
* - High variance: the rubric dimensions disagree sharply (σ > 2.5) — e.g. safety 10 but
* efficiency 2 — which means the local model is confused, not confident.
*
* So ~90% of clear-cut cases finish locally in <2s; the cloud is paid only when it matters.
*
* A light SELF-IMPROVEMENT loop (Feedback Alignment Drift): whenever we escalate, we log
* both tiers' scores plus their drift — the mean |Tier2 − Tier1| across the four dimensions.
* `isDriftRising` reports whether that drift trends UP; the curator injects a few Tier-2
* corrections as few-shot examples into Tier-1's prompt so the local judge re-calibrates.
*
* All scoring math is PURE and unit-tested.
*/
import { tryParseJson } from '../../llm/constrained.js';

/**
 * One judge's rubric scores for a candidate skill. `parseRubricScores` clamps each numeric
 * dimension to 1–10.
 */
export interface RubricScores {
  /** Is the playbook clear and well-structured? */
  readability: number;
  /** Is the prescribed approach efficient (no needless steps)? */
  efficiency: number;
  /** Does it cover the task class, not just one instance? */
  completeness: number;
  /** Would following it avoid destructive / wrong actions? */
  safety: number;
  /** Free-text rationale from the judge; `parseRubricScores` uses '' when none is given. */
  justification: string;
}

/** The four scored rubric dimensions, in canonical order — the single list every helper iterates. */
const DIMS = ['readability', 'efficiency', 'completeness', 'safety'] as const;

/** Numeric scores of `s` in `DIMS` order (the free-text justification is excluded). */
function dimValues(s: RubricScores): number[] {
  return DIMS.map(dim => s[dim]);
}

/**
 * Arithmetic mean of the four dimension scores.
 *
 * @param s Scores to average.
 * @returns The mean of readability, efficiency, completeness and safety.
 */
export function rubricAverage(s: RubricScores): number {
  const values = dimValues(s);
  return values.reduce((sum, v) => sum + v, 0) / values.length;
}

/**
 * Population standard deviation of the four dimension scores (divides by N, not N − 1).
 * Zero when every dimension agrees; grows as the dimensions disagree.
 *
 * @param s Scores to measure.
 * @returns The standard deviation across the four dimensions.
 */
export function rubricStdDev(s: RubricScores): number {
  const mean = rubricAverage(s);
  const values = dimValues(s);
  const variance = values.reduce((acc, v) => acc + (v - mean) ** 2, 0) / values.length;
  return Math.sqrt(variance);
}

/** Thresholds `shouldEscalate` uses to decide that Tier 1's scores are too uncertain. */
export interface EscalationConfig {
  /** Inclusive lower bound of the grey-zone average. */
  twilightLow: number; // default 5.5
  /** Inclusive upper bound of the grey-zone average. */
  twilightHigh: number; // default 7.5
  /** Escalate when the standard deviation across dimensions is strictly greater than this. */
  maxStdDev: number; // default 2.5
}
/** Default thresholds, used by `shouldEscalate` when no config is passed. */
export const DEFAULT_ESCALATION: EscalationConfig = { twilightLow: 5.5, twilightHigh: 7.5, maxStdDev: 2.5 };

/**
 * Decide whether Tier 1's scores are too uncertain to stand on their own.
 *
 * @param scores Tier-1 rubric scores.
 * @param cfg Thresholds; defaults to `DEFAULT_ESCALATION`.
 * @returns true when the average lies within [twilightLow, twilightHigh] (both inclusive)
 *   or the standard deviation across dimensions exceeds `maxStdDev`.
 */
export function shouldEscalate(scores: RubricScores, cfg: EscalationConfig = DEFAULT_ESCALATION): boolean {
  const mean = rubricAverage(scores);
  if (mean >= cfg.twilightLow && mean <= cfg.twilightHigh) {
    return true; // grey-middle average
  }
  // Outside the grey zone: escalate only if the dimensions disagree sharply.
  return rubricStdDev(scores) > cfg.maxStdDev;
}

/**
 * Turn rubric scores into a pass/fail verdict.
 *
 * @param scores Final rubric scores.
 * @param passAvg Minimum average required to pass (default 7.5).
 * @param minSafety Minimum safety score required to pass (default 6).
 * @returns true only when the average reaches `passAvg` AND safety reaches `minSafety`.
 */
export function rubricToVerdict(scores: RubricScores, passAvg = 7.5, minSafety = 6): boolean {
  const goodEnough = rubricAverage(scores) >= passAvg;
  const safeEnough = scores.safety >= minSafety;
  return goodEnough && safeEnough;
}

// ── prompt + parse ────────────────────────────────────────────────────────────

/**
 * Build the system/user prompt pair that asks a judge to score a candidate skill.
 *
 * @param candidateMd Candidate skill text; only its first 8000 characters are embedded.
 * @param calibrationExamples Optional block inserted into the system prompt between the
 *   rubric and the response-format instructions; omitted when empty.
 * @returns `system` (rubric, optional calibration, strict-JSON format) and `user` (the
 *   candidate inside a fenced block).
 */
export function buildRubricPrompt(candidateMd: string, calibrationExamples = ''): { system: string; user: string } {
  const rubric = [
    'You are an independent reviewer scoring a machine-captured "skill" (a reusable playbook) on FOUR dimensions, each 1–10:',
    ' - readability: is the playbook clear and well-structured?',
    ' - efficiency: is the approach it prescribes efficient (no needless steps)?',
    ' - completeness: does it cover the task class, not just one instance?',
    ' - safety: would following it avoid destructive / wrong actions?',
    'Be strict and honest; default low when unsure.',
  ].join('\n');
  const calibration = calibrationExamples ? '\n\n' + calibrationExamples : '';
  const responseFormat =
    '\n\nRespond with STRICT JSON only:\n' +
    '{"readability":n,"efficiency":n,"completeness":n,"safety":n,"justification":"..."}';

  const fence = '```';
  const excerpt = candidateMd.slice(0, 8000);
  const user = '## Candidate skill\n' + fence + '\n' + excerpt + '\n' + fence + '\n\nScore it now.';

  return { system: rubric + calibration + responseFormat, user };
}

/**
 * Parse a judge reply into rubric scores.
 *
 * Every dimension must be a finite number and is clamped into 1–10; a non-string
 * justification becomes ''.
 *
 * @param text Raw model output, handed to `tryParseJson`.
 * @returns The scores, or null when the reply is not an object or any dimension is missing
 *   or non-numeric (the caller treats a missing Tier-1 score as "escalate").
 */
export function parseRubricScores(text: string): RubricScores | null {
  const parsed = tryParseJson(text) as any;
  if (!parsed || typeof parsed !== 'object') return null;

  const clamp = (raw: any): number | null => {
    if (typeof raw !== 'number' || !Number.isFinite(raw)) return null;
    return Math.min(10, Math.max(1, raw));
  };

  const readability = clamp(parsed.readability);
  const efficiency = clamp(parsed.efficiency);
  const completeness = clamp(parsed.completeness);
  const safety = clamp(parsed.safety);
  if (readability === null || efficiency === null || completeness === null || safety === null) {
    return null;
  }

  const justification = typeof parsed.justification === 'string' ? parsed.justification : '';
  return { readability, efficiency, completeness, safety, justification };
}

// ── feedback alignment drift (self-improvement) ────────────────────────────────

/** One logged escalation: a timestamp string, both tiers' rubric scores, and a numeric drift
 * between them (see `alignmentDrift`). */
export interface DriftRecord { ts: string; tier1: RubricScores; tier2: RubricScores; drift: number }

/**
 * How far Tier 1 was from Tier 2: the mean of the absolute per-dimension score differences.
 *
 * @param tier1 Scores from the first judge.
 * @param tier2 Scores from the second judge.
 * @returns 0 when the tiers agree on every dimension; larger means further apart.
 */
export function alignmentDrift(tier1: RubricScores, tier2: RubricScores): number {
  const first = dimValues(tier1);
  const second = dimValues(tier2);
  let total = 0;
  for (let i = 0; i < first.length; i++) {
    total += Math.abs(first[i]! - second[i]!);
  }
  return total / first.length;
}

/**
 * Report whether Tier 1 is drifting further from Tier 2 over time. PURE.
 *
 * Looks only at the last `2 * window` records: the mean drift of the newest `window` is
 * compared with the mean drift of the `window` records just before them.
 *
 * @param records Drift history, oldest first.
 * @param window Number of records per comparison window (default 10).
 * @returns true when the recent mean is strictly greater than the prior mean; false when
 *   there are fewer than `2 * window` records.
 */
export function isDriftRising(records: DriftRecord[], window = 10): boolean {
  const needed = window * 2;
  if (records.length < needed) return false;

  const averageDrift = (chunk: DriftRecord[]): number => {
    let total = 0;
    for (const entry of chunk) total += entry.drift;
    return total / chunk.length;
  };

  const recentMean = averageDrift(records.slice(-window));
  const priorMean = averageDrift(records.slice(-needed, -window));
  return recentMean > priorMean;
}

/**
 * Build the few-shot calibration block injected into Tier 1's prompt: the Tier-2 scores
 * (and a short justification) from the `k` records with the largest drift.
 *
 * Only real disagreements are used. Records with zero or non-numeric drift, or without a
 * `tier2` object, are ignored: a record where both tiers agreed is not a correction, and
 * records come from an on-disk log that may hold malformed entries. Justifications are
 * flattened to one line and capped at 100 characters so each record stays a single bullet.
 *
 * @param records Drift history; not mutated.
 * @param k Maximum number of examples (default 3). Values ≤ 0 or NaN yield ''.
 * @returns The block (header line plus one bullet per example), or '' when there is
 *   nothing to learn from.
 */
export function buildCalibrationBlock(records: DriftRecord[], k = 3): string {
  const limit = k > 0 ? Math.floor(k) : 0; // NaN and negatives fall through to 0
  if (limit === 0) return '';

  const worst = records
    .filter(r =>
      r != null && typeof r.drift === 'number' && r.drift > 0 &&
      r.tier2 != null && typeof r.tier2 === 'object')
    .sort((a, b) => b.drift - a.drift) // filter() returned a copy, so the input stays untouched
    .slice(0, limit);
  if (worst.length === 0) return '';

  const lines = ['CALIBRATION — on these, a senior reviewer scored differently than you tend to; match this calibration:'];
  for (const { tier2 } of worst) {
    // Collapse whitespace so a multi-line justification cannot inject extra prompt lines.
    const note = typeof tier2.justification === 'string'
      ? tier2.justification.replace(/\s+/g, ' ').trim().slice(0, 100)
      : '';
    lines.push(`- correct scores: readability ${tier2.readability}, efficiency ${tier2.efficiency}, completeness ${tier2.completeness}, safety ${tier2.safety}${note ? ` — ${note}` : ''}`);
  }
  return lines.join('\n');
}
78 changes: 78 additions & 0 deletions test/judge-cascade.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import { describe, it, expect } from 'vitest';
import {
rubricAverage, rubricStdDev, shouldEscalate, rubricToVerdict, parseRubricScores,
alignmentDrift, isDriftRising, buildCalibrationBlock, type RubricScores, type DriftRecord,
} from '../src/skills/learning/judge-cascade.js';

/** Test shorthand: build a RubricScores from positional scores (justification defaults to ''). */
const R = (readability: number, efficiency: number, completeness: number, safety: number, justification = ''): RubricScores => {
  const scores: RubricScores = { readability, efficiency, completeness, safety, justification };
  return scores;
};

// Mean and standard deviation over the four rubric dimensions.
describe('rubric math', () => {
  it('average + stdDev', () => {
    const uniform = R(8, 8, 8, 8);
    const lopsided = R(10, 2, 9, 8);

    expect(rubricAverage(uniform)).toBe(8);
    expect(rubricStdDev(uniform)).toBe(0);
    expect(rubricStdDev(lopsided)).toBeGreaterThan(2.5);
  });
});

// Escalation fires for a grey-zone average or sharply disagreeing dimensions, and for nothing else.
describe('shouldEscalate — escalate ONLY when Tier 1 is unsure', () => {
  it('twilight zone (grey middle average) → escalate', () => {
    const greyZone = R(6, 7, 6, 7); // avg 6.5, low variance
    expect(shouldEscalate(greyZone)).toBe(true);
  });
  it('confidently HIGH (clear pass) → do NOT escalate', () => {
    const clearPass = R(9, 9, 8, 9); // avg 8.75, low variance
    expect(shouldEscalate(clearPass)).toBe(false);
  });
  it('confidently LOW (clear reject) → do NOT escalate', () => {
    const clearReject = R(2, 3, 2, 2); // avg 2.25, low variance
    expect(shouldEscalate(clearReject)).toBe(false);
  });
  it('HIGH VARIANCE (dimensions disagree) → escalate even outside twilight', () => {
    const conflicted = R(10, 2, 9, 8); // confused: readability 10 vs efficiency 2
    expect(shouldEscalate(conflicted)).toBe(true);
  });
});

// A pass needs both a high average and an acceptable safety score.
describe('rubricToVerdict', () => {
  it('pass only when clearly good AND safe', () => {
    const goodAndSafe = R(9, 8, 8, 9);
    const unsafe = R(9, 9, 9, 3); // unsafe → fail despite high avg
    const mediocre = R(6, 6, 6, 6); // mediocre → fail

    expect(rubricToVerdict(goodAndSafe)).toBe(true);
    expect(rubricToVerdict(unsafe)).toBe(false);
    expect(rubricToVerdict(mediocre)).toBe(false);
  });
});

// Parsing clamps out-of-range numbers and yields null for anything unusable.
describe('parseRubricScores — clamps, fails closed', () => {
  it('parses + clamps to 1–10', () => {
    const wellFormed = '{"readability":8,"efficiency":7,"completeness":9,"safety":8,"justification":"ok"}';
    expect(parseRubricScores(wellFormed)).toEqual(R(8, 7, 9, 8, 'ok'));

    const outOfRange = '{"readability":99,"efficiency":-5,"completeness":9,"safety":8}';
    expect(parseRubricScores(outOfRange)!.readability).toBe(10);
    expect(parseRubricScores(outOfRange)!.efficiency).toBe(1);
  });
  it('garbage / missing dimension → null (caller escalates)', () => {
    for (const bad of ['not json', '{"readability":8}']) {
      expect(parseRubricScores(bad)).toBeNull();
    }
  });
});

// Drift measurement, trend detection, and the calibration block built from logged drift.
describe('feedback alignment drift — self-improvement signal', () => {
  it('alignmentDrift is the mean per-dimension |diff|', () => {
    const same = R(8, 8, 8, 8);
    expect(alignmentDrift(same, R(8, 8, 8, 8))).toBe(0);
    expect(alignmentDrift(R(10, 10, 10, 10), R(6, 6, 6, 6))).toBe(4);
  });

  // Record with a chosen drift value; the scores themselves are irrelevant to the trend check.
  const rec = (drift: number): DriftRecord => ({ ts: '', tier1: R(5, 5, 5, 5), tier2: R(5, 5, 5, 5), drift });
  const repeat = (count: number, drift: number): DriftRecord[] => Array.from({ length: count }, () => rec(drift));

  it('isDriftRising compares recent vs prior window', () => {
    const rising = [...repeat(10, 1), ...repeat(10, 3)];
    expect(isDriftRising(rising, 10)).toBe(true);

    const stable = repeat(20, 2);
    expect(isDriftRising(stable, 10)).toBe(false);

    expect(isDriftRising([rec(1)], 10)).toBe(false); // not enough data
  });

  it('buildCalibrationBlock surfaces the worst disagreements (Tier-2 scores)', () => {
    const bigMiss: DriftRecord = { ts: '', tier1: R(9, 9, 9, 9), tier2: R(3, 3, 3, 3, 'overfit one-off'), drift: 6 };
    const agreement: DriftRecord = { ts: '', tier1: R(7, 7, 7, 7), tier2: R(7, 7, 7, 7), drift: 0 };

    const block = buildCalibrationBlock([bigMiss, agreement], 1);
    for (const expected of ['CALIBRATION', 'safety 3', 'overfit one-off']) {
      expect(block).toContain(expected); // 'safety 3' is the Tier-2 correction
    }
    expect(buildCalibrationBlock([], 3)).toBe('');
  });
});