From 548bea3f9c6669e1e0f40139e85f3374c7a501d5 Mon Sep 17 00:00:00 2001
From: Louise Lau <QodeXcli@users.noreply.github.com>
Date: Thu, 25 Jun 2026 23:02:17 +0800
Subject: [PATCH] feat(skills): escalating-cascade judge + alignment-drift
 self-improvement (Phase 5)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per the proposed architecture: instead of an ensemble of heavy models per
decision (which crawls on local hardware), the judge CASCADES — a fast Tier-1
model scores every candidate on a 4-axis rubric, and we escalate to a heavy
Tier-2 model ONLY when Tier-1 is genuinely unsure. So clear-cut cases finish
locally; the cloud is paid only when it matters.

- src/skills/learning/judge-cascade.ts (PURE, unit-tested): RubricScores +
  rubricAverage/rubricStdDev; shouldEscalate — escalate on the TWILIGHT ZONE
  (avg 5.5–7.5) OR HIGH VARIANCE (σ > 2.5, dimensions disagree → model confused);
  rubricToVerdict; buildRubricPrompt/parseRubricScores (clamps 1–10, fails closed →
  escalate). Self-improvement: alignmentDrift (mean |Tier2−Tier1|), isDriftRising
  (recent vs prior window), buildCalibrationBlock (inject Tier-2's worst
  corrections as few-shot so Tier-1 re-calibrates).
- curator.ts integration: Tier-1 (judgeRoute) scores with the rubric + any
  calibration block (currently injected whenever drift records exist; isDriftRising
  is not yet wired in). If Tier-1's output can't be parsed or shouldEscalate fires,
  AND a Tier-2 model is configured (learning.judgeModelTier2), the candidate is
  re-scored with Tier-2; when both tiers produced scores, the drift is logged to
  ~/.qodex/judge-drift.jsonl. The rubric verdict feeds the existing independence +
  human-protection promotion gate unchanged.
- config: learning.judgeModelTier2 (the heavy judge; unset ⇒ no escalation).

Tests: 11 (escalation gates: twilight / high-variance / confident-pass /
confident-reject; rubric verdict incl. unsafe→fail; parse clamp + fail-closed;
drift mean, rising-window, calibration block). typecheck + full suite (1200) +
build green.
---
 src/config/defaults.ts               |   4 +
 src/skills/learning/curator.ts       |  62 +++++++++++--
 src/skills/learning/judge-cascade.ts | 130 +++++++++++++++++++++++++++
 test/judge-cascade.test.ts           |  78 ++++++++++++++++
 4 files changed, 265 insertions(+), 9 deletions(-)
 create mode 100644 src/skills/learning/judge-cascade.ts
 create mode 100644 test/judge-cascade.test.ts

diff --git a/src/config/defaults.ts b/src/config/defaults.ts
index 5e135e0..15ef6a0 100644
--- a/src/config/defaults.ts
+++ b/src/config/defaults.ts
@@ -435,6 +435,10 @@ export interface QodexConfig {
     /** Explicit model id for the independent judge. Must differ from defaults.model
      *  (self-grade is rejected). Falls back to the 'reflection' routing role when unset. */
     judgeModel?: string;
+    /** Tier-2 (heavy/cloud) judge for the escalating cascade — used ONLY when the Tier-1
+     *  judge is unsure (grey-zone average or high cross-dimension variance). Must differ from
+     *  defaults.model and judgeModel. Unset ⇒ no escalation (Tier-1 verdict stands). */
+    judgeModelTier2?: string;
     /** When auto-promoting, require at least this confidence (0–100). Default 0 (the
      *  judge's pass is sufficient); raise it to gate low-confidence captures. */
     autoPromoteMinConfidence?: number;
diff --git a/src/skills/learning/curator.ts b/src/skills/learning/curator.ts
index b0e28e0..8716585 100644
--- a/src/skills/learning/curator.ts
+++ b/src/skills/learning/curator.ts
@@ -19,11 +19,28 @@ import { ModelRouter } from '../../llm/router.js';
 import { logger } from '../../utils/logger.js';
 import { loadSkillByName } from '../loader.js';
 import { listCandidates, readCandidate, promoteCandidate, writeCandidate, archiveCandidate } from './candidate-store.js';
-import { buildJudgePrompt, parseJudgeVerdict, buildMergePrompt, parseMergeResult } from './judge.js';
+import { buildMergePrompt, parseMergeResult } from './judge.js';
+import {
+  buildRubricPrompt, parseRubricScores, shouldEscalate, rubricToVerdict,
+  alignmentDrift, buildCalibrationBlock, type DriftRecord, type RubricScores,
+} from './judge-cascade.js';
 import { decidePromotion } from './promotion.js';
 import { snapshotSkills } from './snapshot.js';
 import { findSimilarPairs, skillSimilarityText } from './similarity.js';
 import { recordLearningEvent } from './ledger.js';
+import { promises as fsp } from 'fs';
+import * as os from 'os';
+import * as nodePath from 'path';
+
+function driftPath(): string { return nodePath.join(os.homedir(), '.qodex', 'judge-drift.jsonl'); } // JSONL drift log under the user's home directory
+/** Load the drift log. A missing/unreadable file ⇒ []; corrupt or mis-shaped JSONL lines are skipped instead of discarding the whole history. */
+async function readDrift(): Promise<DriftRecord[]> {
+  return (await fsp.readFile(driftPath(), 'utf-8').catch(() => '')).split('\n').filter(Boolean).flatMap((l): DriftRecord[] => { try { const r = JSON.parse(l); return r?.tier1 && r?.tier2 && Number.isFinite(r.drift) ? [r as DriftRecord] : []; } catch { return []; } });
+}
+/** Append one drift record as a JSONL line, creating the log's directory first. Best-effort: any failure is silently ignored. */
+async function appendDrift(r: DriftRecord): Promise<void> {
+  try { const file = driftPath(); await fsp.mkdir(nodePath.dirname(file), { recursive: true }); await fsp.appendFile(file, `${JSON.stringify(r)}\n`, 'utf-8'); } catch { /* best-effort */ }
+}
 
 export interface CurateResult {
   snapshot: string | null;
@@ -122,6 +139,24 @@ export async function curateCandidates(
   // Re-list after the merge pass so promotion sees the collapsed set.
   const liveCandidates = (await listCandidates()).filter(c => !mergedAway.has(c.name));
   const existingNames = liveCandidates.map(c => c.name);
+
+  // ── Escalating cascade setup ── Tier 1 is judgeRoute (the light/local judge). Tier 2 is an
+  // optional heavy model (learning.judgeModelTier2) we escalate to only when Tier 1 is unsure.
+  const tier2Model = String((config as any).learning?.judgeModelTier2 ?? '').trim();
+  let tier2Route: ReturnType<typeof router.route> | null = null;
+  if (tier2Model && tier2Model !== authorModel && tier2Model !== judgeRoute.model) {
+    try { tier2Route = router.route('reflection', 2000, { explicitModel: tier2Model }); } catch { tier2Route = null; }
+  }
+  // Calibration: whenever past escalations were logged, feed Tier 1 the worst Tier-2 corrections (not yet gated on isDriftRising).
+  const driftRecords = await readDrift();
+  const calibration = buildCalibrationBlock(driftRecords, 3);
+
+  const scoreWith = async (route: NonNullable<typeof tier2Route>, md: string, calib: string): Promise<RubricScores | null> => {
+    const { system, user } = buildRubricPrompt(md, calib);
+    const text = await drainText(route.provider.complete({ model: route.model, messages: [{ role: 'system', content: system }, { role: 'user', content: user }], temperature: 0 } as any));
+    return parseRubricScores(text);
+  };
+
   for (const c of liveCandidates) {
     const md = await readCandidate(c.name);
     if (!md) { result.skipped.push({ name: c.name, reason: 'unreadable' }); continue; }
@@ -132,16 +167,25 @@ export async function curateCandidates(
       continue;
     }
 
-    log(`Judging "${c.name}" with ${judgeRoute.model} …`);
+    // ── Escalating cascade: Tier 1 (light) scores; escalate to Tier 2 (heavy) only when unsure ──
     let verdict;
     try {
-      const { system, user } = buildJudgePrompt(md, existingNames.filter(n => n !== c.name));
-      const text = await drainText(judgeRoute.provider.complete({
-        model: judgeRoute.model,
-        messages: [{ role: 'system', content: system }, { role: 'user', content: user }],
-        temperature: 0,
-      } as any));
-      verdict = parseJudgeVerdict(text, judgeRoute.model);
+      log(`Judging "${c.name}" with ${judgeRoute.model} (Tier 1) …`);
+      const t1 = await scoreWith(judgeRoute, md, calibration);
+      let finalScores = t1, finalModel = judgeRoute.model, reasons: string[] = [];
+      // Escalate when Tier 1 couldn't score (null) or is in the grey/confused zone, and a Tier 2 exists.
+      if (tier2Route && (!t1 || shouldEscalate(t1))) {
+        log(`  ↑ escalating "${c.name}" to ${tier2Route.model} (Tier 2)${t1 ? ` — avg/variance unclear` : ' — Tier 1 unparseable'} …`);
+        const t2 = await scoreWith(tier2Route, md, '');
+        if (t2) {
+          finalScores = t2; finalModel = tier2Route.model; reasons = [t2.justification].filter(Boolean);
+          if (t1) await appendDrift({ ts: new Date().toISOString(), tier1: t1, tier2: t2, drift: alignmentDrift(t1, t2) });
+        }
+      } else if (t1) {
+        reasons = [t1.justification].filter(Boolean);
+      }
+      if (!finalScores) { result.skipped.push({ name: c.name, reason: 'judge produced no parseable scores (Tier 1 + Tier 2)' }); continue; }
+      verdict = { pass: rubricToVerdict(finalScores), judgeModel: finalModel, reasons };
     } catch (e: any) {
       result.skipped.push({ name: c.name, reason: `judge call failed: ${e?.message}` });
       continue;
diff --git a/src/skills/learning/judge-cascade.ts b/src/skills/learning/judge-cascade.ts
new file mode 100644
index 0000000..61be476
--- /dev/null
+++ b/src/skills/learning/judge-cascade.ts
@@ -0,0 +1,130 @@
+/**
+ * Escalating cascade judge (Phase 5).
+ *
+ * Ensembling several heavy models per decision would crawl on local hardware. Instead we
+ * cascade: a fast LOCAL model (Tier 1) scores every candidate on a fixed rubric, and we
+ * escalate to a heavy CLOUD model (Tier 2) ONLY when Tier 1 is genuinely unsure —
+ * detected from the scores themselves:
+ *
+ *   - Twilight zone: the average lands in the grey middle (5.5–7.5).
+ *   - High variance: the rubric dimensions disagree sharply (σ > 2.5) — e.g. safety 10 but
+ *     efficiency 2 — which means the local model is confused, not confident.
+ *
+ * So ~90% of clear-cut cases finish locally in <2s; the cloud is paid only when it matters.
+ *
+ * A light SELF-IMPROVEMENT loop (Feedback Alignment Drift): whenever both tiers score, we log the
+ * mean |Tier2 − Tier1| across dimensions. The curator injects the worst Tier-2 corrections as few-shot
+ * examples into Tier-1's prompt so it re-calibrates; `isDriftRising` can gate this on an upward trend.
+ *
+ * All scoring math is PURE and unit-tested.
+ */
+import { tryParseJson } from '../../llm/constrained.js';
+
+export interface RubricScores {
+  readability: number;    // is the playbook clear and well-structured? (parseRubricScores clamps to 1–10)
+  efficiency: number;     // is the prescribed approach free of needless steps? (clamped to 1–10 when parsed)
+  completeness: number;   // does it cover the task class, not just one instance? (clamped to 1–10 when parsed)
+  safety: number;         // would following it avoid destructive / wrong actions? (clamped to 1–10 when parsed)
+  justification: string;  // the judge's free-text rationale; '' when the model supplied none
+}
+
+/** The scored rubric dimensions, in the fixed order used by every per-dimension computation. */
+const DIMS = ['readability', 'efficiency', 'completeness', 'safety'] as const;
+
+/** Numeric scores of `s` in `DIMS` order; derived from `DIMS` so the list lives in one place (justification excluded). */
+function dimValues(s: RubricScores): number[] { return DIMS.map(d => s[d]); }
+
+/** Arithmetic mean of the rubric's dimension scores (the free-text justification is not scored). */
+export function rubricAverage(s: RubricScores): number {
+  const scores = dimValues(s); let total = 0; for (const x of scores) total += x; return total / scores.length;
+}
+
+/** Population standard deviation (divides by N, not N − 1) of the rubric's dimension scores. */
+export function rubricStdDev(s: RubricScores): number {
+  const scores = dimValues(s), mean = rubricAverage(s);
+  const sumSq = scores.reduce((acc, x) => acc + (x - mean) ** 2, 0);
+  return Math.sqrt(sumSq / scores.length);
+}
+
+export interface EscalationConfig {
+  twilightLow: number;   // default 5.5 — inclusive lower bound of the grey-zone average
+  twilightHigh: number;  // default 7.5 — inclusive upper bound of the grey-zone average
+  maxStdDev: number;     // default 2.5 — escalate when σ is strictly greater than this
+}
+export const DEFAULT_ESCALATION: EscalationConfig = { twilightLow: 5.5, twilightHigh: 7.5, maxStdDev: 2.5 }; // used when shouldEscalate is called without a cfg
+
+/** True when Tier 1 looks unsure: its average lies in [twilightLow, twilightHigh] (inclusive) or σ is strictly above maxStdDev. */
+export function shouldEscalate(scores: RubricScores, cfg: EscalationConfig = DEFAULT_ESCALATION): boolean {
+  const { twilightLow, twilightHigh, maxStdDev } = cfg;
+  if (rubricStdDev(scores) > maxStdDev) return true; // dimensions disagree → the judge is confused
+  const mean = rubricAverage(scores);
+  return mean >= twilightLow && mean <= twilightHigh; // grey-zone average
+}
+
+/** Verdict gate: a pass needs BOTH safety ≥ minSafety and average ≥ passAvg; anything else is a fail. */
+export function rubricToVerdict(scores: RubricScores, passAvg = 7.5, minSafety = 6): boolean {
+  const safeEnough = scores.safety >= minSafety; return safeEnough && rubricAverage(scores) >= passAvg;
+}
+
+// ── prompt + parse ────────────────────────────────────────────────────────────
+
+/** Assemble the judge prompt: rubric + optional calibration + JSON reply format as `system`; the candidate (first 8000 chars, fenced) as `user`. */
+export function buildRubricPrompt(candidateMd: string, calibrationExamples = ''): { system: string; user: string } {
+  const rubric = [
+    'You are an independent reviewer scoring a machine-captured "skill" (a reusable playbook) on FOUR dimensions, each 1–10:',
+    '  - readability: is the playbook clear and well-structured?',
+    '  - efficiency: is the approach it prescribes efficient (no needless steps)?',
+    '  - completeness: does it cover the task class, not just one instance?',
+    '  - safety: would following it avoid destructive / wrong actions?',
+    'Be strict and honest; default low when unsure.',
+  ].join('\n');
+  const calib = calibrationExamples ? `\n\n${calibrationExamples}` : '';
+  const reply = '\n\nRespond with STRICT JSON only:\n{"readability":n,"efficiency":n,"completeness":n,"safety":n,"justification":"..."}';
+  return { system: rubric + calib + reply, user: `## Candidate skill\n\`\`\`\n${candidateMd.slice(0, 8000)}\n\`\`\`\n\nScore it now.` };
+}
+
+/** Extract the four rubric scores from model output, clamping each into [1, 10]. Returns null when the text is not a
+ *  JSON object or any dimension is missing / not a finite number (the curator treats a null Tier-1 score as "escalate"). */
+export function parseRubricScores(text: string): RubricScores | null {
+  const parsed = tryParseJson(text) as any;
+  if (!parsed || typeof parsed !== 'object') return null;
+  const raw: unknown[] = DIMS.map(k => parsed[k]);
+  if (!raw.every(x => typeof x === 'number' && Number.isFinite(x))) return null;
+  const [readability, efficiency, completeness, safety] = (raw as number[]).map(x => Math.max(1, Math.min(10, x))) as [number, number, number, number];
+  return { readability, efficiency, completeness, safety, justification: typeof parsed.justification === 'string' ? parsed.justification : '' };
+}
+
+// ── feedback alignment drift (self-improvement) ────────────────────────────────
+
+export interface DriftRecord { ts: string; tier1: RubricScores; tier2: RubricScores; drift: number } // one escalation: both tiers' scores plus their alignmentDrift; the curator writes ts as an ISO timestamp
+
+/** Mean absolute per-dimension gap between the two tiers' scores; 0 means they agreed on every dimension. */
+export function alignmentDrift(tier1: RubricScores, tier2: RubricScores): number {
+  const upper = dimValues(tier2);
+  return dimValues(tier1).map((x, i) => Math.abs(x - upper[i]!)).reduce((sum, gap) => sum + gap, 0) / upper.length;
+}
+
+/**
+ * Reports whether Tier 1's drift is trending upward: true only when the mean drift of the last
+ * `window` records strictly exceeds that of the `window` records before them. Needs 2×window records. PURE.
+ */
+export function isDriftRising(records: DriftRecord[], window = 10): boolean {
+  if (records.length < window * 2) return false;
+  const meanDrift = (from: number, to?: number): number => {
+    const span = records.slice(from, to); let total = 0; for (const r of span) total += r.drift; return total / span.length;
+  };
+  return meanDrift(-window) > meanDrift(-window * 2, -window);
+}
+
+/** Build the few-shot calibration block from the `k` worst disagreements among the most recent
+ *  `recentWindow` records (assumed oldest-first, as appended), to inject into Tier 1's prompt so it scores like
+ *  Tier 2. Records where the tiers agreed (drift 0) teach nothing and are dropped; returns '' if none remain. */
+export function buildCalibrationBlock(records: DriftRecord[], k = 3, recentWindow = 50): string {
+  // Bounding to recent records keeps a few ancient outliers from pinning the examples forever.
+  const worst = records.slice(-Math.max(1, recentWindow)).filter(r => r.drift > 0)
+    .sort((a, b) => b.drift - a.drift).slice(0, Math.max(0, k));
+  if (worst.length === 0) return '';
+  const lines = ['CALIBRATION — on these, a senior reviewer scored differently than you tend to; match this calibration:'];
+  for (const r of worst) lines.push(`- correct scores: readability ${r.tier2.readability}, efficiency ${r.tier2.efficiency}, completeness ${r.tier2.completeness}, safety ${r.tier2.safety}${r.tier2.justification ? ` — ${r.tier2.justification.slice(0, 100)}` : ''}`);
+  return lines.join('\n');
+}
diff --git a/test/judge-cascade.test.ts b/test/judge-cascade.test.ts
new file mode 100644
index 0000000..89c7c40
--- /dev/null
+++ b/test/judge-cascade.test.ts
@@ -0,0 +1,78 @@
+import { describe, it, expect } from 'vitest';
+import {
+  rubricAverage, rubricStdDev, shouldEscalate, rubricToVerdict, parseRubricScores,
+  alignmentDrift, isDriftRising, buildCalibrationBlock, type RubricScores, type DriftRecord,
+} from '../src/skills/learning/judge-cascade.js';
+
+/** Test shorthand: build a RubricScores from positional scores (justification defaults to ''). */
+function R(readability: number, efficiency: number, completeness: number, safety: number, justification = ''): RubricScores { return { readability, efficiency, completeness, safety, justification }; }
+
+describe('rubric math', () => {
+  it('average + stdDev', () => {
+    expect(rubricAverage(R(8, 8, 8, 8))).toBe(8);
+    expect(rubricStdDev(R(8, 8, 8, 8))).toBe(0);                // identical scores → zero spread
+    expect(rubricStdDev(R(10, 2, 9, 8))).toBeGreaterThan(2.5);  // population σ ≈ 3.11
+  });
+});
+
+describe('shouldEscalate — escalate ONLY when Tier 1 is unsure', () => {
+  it('twilight zone (grey middle average) → escalate', () => {
+    expect(shouldEscalate(R(6, 7, 6, 7))).toBe(true);   // avg 6.5, low variance
+  });
+  it('confidently HIGH (clear pass) → do NOT escalate', () => {
+    expect(shouldEscalate(R(9, 9, 8, 9))).toBe(false);  // avg 8.75, low variance
+  });
+  it('confidently LOW (clear reject) → do NOT escalate', () => {
+    expect(shouldEscalate(R(2, 3, 2, 2))).toBe(false);  // avg 2.25, low variance
+  });
+  it('HIGH VARIANCE (dimensions disagree) → escalate even outside twilight', () => {
+    expect(shouldEscalate(R(10, 2, 10, 10))).toBe(true); // avg 8 (above the 5.5–7.5 twilight), σ ≈ 3.46 → only the variance gate can fire
+  });
+});
+
+describe('rubricToVerdict', () => {
+  it('pass only when clearly good AND safe', () => {
+    expect(rubricToVerdict(R(9, 8, 8, 9))).toBe(true);   // avg 8.5 ≥ 7.5 and safety 9 ≥ 6
+    expect(rubricToVerdict(R(9, 9, 9, 3))).toBe(false);  // unsafe → fail: avg 7.5 clears the bar, but safety 3 < 6
+    expect(rubricToVerdict(R(6, 6, 6, 6))).toBe(false);  // mediocre → fail: avg 6 < 7.5
+  });
+});
+
+describe('parseRubricScores — clamps, fails closed', () => {
+  it('parses + clamps to 1–10', () => {
+    const wellFormed = '{"readability":8,"efficiency":7,"completeness":9,"safety":8,"justification":"ok"}';
+    expect(parseRubricScores(wellFormed)).toEqual(R(8, 7, 9, 8, 'ok'));
+    const outOfRange = parseRubricScores('{"readability":99,"efficiency":-5,"completeness":9,"safety":8}')!;
+    expect([outOfRange.readability, outOfRange.efficiency]).toEqual([10, 1]); // 99 → 10, −5 → 1
+  });
+  it('garbage / missing dimension → null (caller escalates)', () => {
+    // Non-JSON text and a partially-filled object must both be rejected outright.
+    for (const bad of ['not json', '{"readability":8}']) expect(parseRubricScores(bad)).toBeNull();
+  });
+});
+
+describe('feedback alignment drift — self-improvement signal', () => {
+  it('alignmentDrift is the mean per-dimension |diff|', () => {
+    expect(alignmentDrift(R(8, 8, 8, 8), R(8, 8, 8, 8))).toBe(0); // identical scores → no drift
+    expect(alignmentDrift(R(10, 10, 10, 10), R(6, 6, 6, 6))).toBe(4); // every dimension off by 4
+  });
+  const rec = (drift: number): DriftRecord => ({ ts: '', tier1: R(5, 5, 5, 5), tier2: R(5, 5, 5, 5), drift }); // record with a preset drift; the scores are placeholders
+  it('isDriftRising compares recent vs prior window', () => {
+    const rising = [...Array(10).fill(0).map(() => rec(1)), ...Array(10).fill(0).map(() => rec(3))]; // prior window drift 1, recent window drift 3
+    expect(isDriftRising(rising, 10)).toBe(true);
+    const stable = Array(20).fill(0).map(() => rec(2)); // flat drift: equal means are not "rising" (strict >)
+    expect(isDriftRising(stable, 10)).toBe(false);
+    expect(isDriftRising([rec(1)], 10)).toBe(false); // not enough data
+  });
+  it('buildCalibrationBlock surfaces the worst disagreements (Tier-2 scores)', () => {
+    const recs: DriftRecord[] = [
+      { ts: '', tier1: R(9, 9, 9, 9), tier2: R(3, 3, 3, 3, 'overfit one-off'), drift: 6 },
+      { ts: '', tier1: R(7, 7, 7, 7), tier2: R(7, 7, 7, 7), drift: 0 },
+    ];
+    const block = buildCalibrationBlock(recs, 1); // k = 1 → only the drift-6 record is surfaced
+    expect(block).toContain('CALIBRATION');
+    expect(block).toContain('safety 3');           // the Tier-2 correction
+    expect(block).toContain('overfit one-off');
+    expect(buildCalibrationBlock([], 3)).toBe(''); // no records → no block
+  });
+});