From 437d4fcc4d2f13d4f227c38375974f68b82349e9 Mon Sep 17 00:00:00 2001
From: Louise Lau
Date: Thu, 25 Jun 2026 22:36:13 +0800
Subject: [PATCH] =?UTF-8?q?feat(context):=20episodic=20memory=20=E2=80=94?=
 =?UTF-8?q?=20recall=20how=20a=20similar=20task=20was=20solved=20before?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Companion to user-preference modeling (#22). Instead of rediscovering a recurring job,
QodeX records a lean episode after each objectively-successful task and, at the start of a
new one, recalls the most SIMILAR past task on this project and injects a one-line reminder
of what worked. Off by default (learning.episodicMemory.enabled).

Clean + lightweight, same discipline as #22:
- src/context/episodic-memory.ts (pure rank + thin I/O): records to
  ~/.qodex/episodes/<project-hash>.jsonl; rankEpisodes scores a query against past episodes
  by lexical TF-cosine (reuses the tested similarity primitives — no embedding dependency),
  excludes near-identical re-runs (score ≥ 0.98), returns top-K above a similarity
  threshold; buildEpisodeBlock renders a concise block.
- SMART, not always-on: an unrelated task injects nothing (below threshold), and only a
  one-line summary is injected, never full transcripts.
- Loop hooks: record at the objective-success point (alongside capture/dataset); inject the
  recalled block into the system prompt at run start.
- The lexical ranker is a pure function, so an embedding-backed variant can be swapped in
  later without touching call sites.

Also: completed the README Self-learning section with the episodic-memory step, its config,
and a worked end-to-end example.

Live-verified: recorded two episodes, a pagination-shaped query recalled the pagination
episode (with files), an unrelated k8s query recalled nothing.

Tests: 6 (similar-retrieval, unrelated→empty, self-match excluded, topK/sort, empty guards).
typecheck + full suite (1189) + build green.
---
 README.md                      | 22 ++++++++
 src/agent/loop.ts              | 31 +++++++++++
 src/config/defaults.ts         | 12 +++++
 src/context/episodic-memory.ts | 98 ++++++++++++++++++++++++++++++++++
 test/episodic-memory.test.ts   | 46 ++++++++++++++++
 5 files changed, 209 insertions(+)
 create mode 100644 src/context/episodic-memory.ts
 create mode 100644 test/episodic-memory.test.ts

diff --git a/README.md b/README.md
index c957c22..825f8c9 100644
--- a/README.md
+++ b/README.md
@@ -85,6 +85,9 @@ QodeX can **learn reusable playbooks from your successful tasks** — without th
 3. **Independent review** — `qodex skill curate` runs an **independent judge model** (a *different* model from the one that did the work — a self-grade is refused) against a fixed rubric (reusable / correct / specific / non-redundant). Near-duplicate candidates are **merged** into one. It **never overwrites a human-authored skill**, and snapshots the skills dir (`tar.gz`) before any change so you can roll back.
 4. **Auto-evaluation** — `qodex skill eval <name>` (or `learning.autoEval` to run it right after capture) **replays the skill's original task in a throwaway git worktree** and runs the **real** verifier (`tsc`/`ruff`/…) on the code it produces, recording **pass / fail / inconclusive** into the skill. It tests whether the skill actually *works*, not just whether a judge likes it. Content-hash cached.
 5. **Learning from failures** — with `learning.failureLessons.enabled`, QodeX records tool failures and, once a mistake **recurs across tasks**, injects a deterministic "learned caution" into the prompt (e.g. *"verify a symbol exists before `edit_symbol`"*) so it stops repeating it. One-offs never teach; see `qodex skill lessons`.
+6. **Episodic memory** — with `learning.episodicMemory.enabled`, QodeX records a lean episode after each successful task and, at the start of a new one, recalls the **most similar past task on this project** and injects a one-line reminder of what worked — so it reuses its own approach instead of rediscovering it. Smart retrieval: only the top matches (2 by default) above a similarity threshold (an unrelated task recalls nothing).
+
+QodeX also **auto-matches your code style** (indentation, quotes, semicolons, naming — inferred from the project + `.editorconfig`) so generated code blends in without you having to spell it out. Off via `context.styleProfile: false`.

 ```yaml
 # ~/.qodex/config.yaml — opt in
@@ -96,6 +99,25 @@ learning:
   autoEval: false  # run `skill eval` automatically after each capture
   failureLessons:
     enabled: true  # learn from RECURRING tool failures
+  episodicMemory:
+    enabled: true  # recall the most similar past task and reuse what worked
+```
+
+**A worked example.** With `learning.enabled` + an independent `judgeModel`, a typical loop:
+
+```text
+> add cursor pagination to the /orders endpoint   # you give a task
+… QodeX edits, type-checks, tests, and the sandbox merges (objective success) …
+🎓 Captured candidate skill "add-cursor-pagination" (confidence 82/100)
+🧪 Auto-eval of "add-cursor-pagination": pass      # (if learning.autoEval)
+
+$ qodex skill candidates   # review the quarantined capture
+$ qodex skill curate       # an INDEPENDENT judge promotes/merges the good ones
+$ qodex skill stats        # captured 3 · promoted 2 · promotion rate 67%
+
+# next week, a similar task:
+> add pagination to the /users endpoint
+# → QodeX recalls the past episode + loads the promoted skill automatically.
 ```

 ```bash
diff --git a/src/agent/loop.ts b/src/agent/loop.ts
index fadd354..5ff614f 100644
--- a/src/agent/loop.ts
+++ b/src/agent/loop.ts
@@ -566,6 +566,23 @@ export class AgentLoop {
       }
     }

+    // ── Episodic memory: recall the most SIMILAR past task on this project ──
+    // Smart retrieval (top-K above a similarity threshold), concise injection. An unrelated
+    // task injects nothing. Opt-in via learning.episodicMemory.enabled.
+    const emCfg = (this.config as any).learning?.episodicMemory;
+    if (emCfg?.enabled && mode !== 'plan') {
+      try {
+        const { loadEpisodeBlock } = await import('../context/episodic-memory.js');
+        const block = await loadEpisodeBlock(this.cwd, String(userPrompt), {
+          topK: emCfg.topK ?? 2,
+          minScore: emCfg.minSimilarity ?? 0.18,
+        });
+        if (block) { sysPrompt += `\n\n${block}`; logger.info('Episodic memory injected'); }
+      } catch (e: any) {
+        logger.debug('Episodic recall skipped', { err: e?.message });
+      }
+    }
+
     // ── Failure-driven learning: inject cautions mined from RECURRING past failures ──
     // Deterministic, bounded, opt-in. Also stamp this run's task key so failures we
     // record below are attributable to a distinct task (the repetition gate counts tasks).
@@ -698,6 +715,20 @@ export class AgentLoop {
             logger.debug('Dataset export skipped', { err: e?.message });
           }
         }
+        // ── Episodic memory: record a lean "how I solved this" episode for later recall ──
+        if ((this.config as any).learning?.episodicMemory?.enabled) {
+          try {
+            const { recordEpisode } = await import('../context/episodic-memory.js');
+            await recordEpisode(this.cwd, {
+              prompt: String(firstUserMsg),
+              summary: finalContent.slice(0, 300),
+              filesChanged: changedFiles,
+              toolsUsed: [...this.sessionToolNames],
+            });
+          } catch (e: any) {
+            logger.debug('Episode record skipped', { err: e?.message });
+          }
+        }
       }
       // ── Skill-learning: capture a CANDIDATE skill (opt-in, quarantined) ──
       // We're on the objectively-successful path: the sandbox compiled and squash-merged,
diff --git a/src/config/defaults.ts b/src/config/defaults.ts
index 40ba798..5e135e0 100644
--- a/src/config/defaults.ts
+++ b/src/config/defaults.ts
@@ -441,6 +441,18 @@ export interface QodexConfig {
     /** `qodex skill eval` cache TTL in hours — skip re-evaluating an unchanged skill
      * within this window. Default 24. */
     evalCacheTtlHours?: number;
+    /**
+     * Episodic memory — record a lean episode after each successful task and, at the start
+     * of a new one, inject the most SIMILAR past episode(s) so the agent reuses its own
+     * proven approach. Smart retrieval (top-K above a threshold), concise injection.
+     */
+    episodicMemory?: {
+      enabled?: boolean;
+      /** How many past episodes to inject. Default 2. */
+      topK?: number;
+      /** Min lexical similarity (0–1) to inject — below this, nothing. Default 0.18. */
+      minSimilarity?: number;
+    };
     /**
      * Failure-driven learning — record tool failures and, once a pattern RECURS across
      * tasks, inject a deterministic "learned caution" into the system prompt so the agent
diff --git a/src/context/episodic-memory.ts b/src/context/episodic-memory.ts
new file mode 100644
index 0000000..90c5d50
--- /dev/null
+++ b/src/context/episodic-memory.ts
@@ -0,0 +1,98 @@
+/**
+ * Episodic memory — "how did I solve a task like this before?"
+ *
+ * Companion to user-preference modeling: instead of the user re-explaining a recurring
+ * job, QodeX records a lean episode after each objectively-successful task and, at the
+ * start of a NEW task, retrieves the most SIMILAR past episode(s) for this project and
+ * injects a concise reminder into the prompt — so the agent reuses its own proven
+ * approach. Smart, not heavy: it injects only the top-K above a similarity threshold (an
+ * unrelated task injects nothing), and only a short summary, never full transcripts.
+ *
+ * v1 similarity is lexical TF-cosine (reusing the tested primitives from the skill-dedup
+ * code) — dependency-free and lightweight. The ranker is a pure function so an
+ * embedding-backed variant can be swapped in later without touching the call sites.
+ */
+import { promises as fs } from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { createHash } from 'crypto';
+import { logger } from '../utils/logger.js';
+import { tokenize, termFreq, cosineSim } from '../skills/learning/similarity.js';
+
+export interface Episode {
+  ts: string;
+  /** The task prompt. */
+  prompt: string;
+  /** A short summary of what worked. */
+  summary: string;
+  filesChanged: string[];
+  toolsUsed: string[];
+}
+
+export interface EpisodeMatch extends Episode { score: number }
+
+function episodesPath(projectRoot: string): string {
+  const hash = createHash('sha1').update(projectRoot).digest('hex').slice(0, 16);
+  return path.join(os.homedir(), '.qodex', 'episodes', `${hash}.jsonl`);
+}
+
+/** Append one episode after an objectively-successful task. Best-effort. */
+export async function recordEpisode(projectRoot: string, rec: Omit<Episode, 'ts'>): Promise<void> {
+  try {
+    if (!rec.prompt.trim()) return;
+    const full = episodesPath(projectRoot);
+    await fs.mkdir(path.dirname(full), { recursive: true });
+    await fs.appendFile(full, JSON.stringify({ ts: new Date().toISOString(), ...rec }) + '\n', 'utf-8');
+  } catch (e: any) {
+    logger.debug('Episode not recorded', { err: e?.message });
+  }
+}
+
+/** Read this project's episodes (most recent `limit`). */
+export async function readEpisodes(projectRoot: string, limit = 500): Promise<Episode[]> {
+  try {
+    const raw = await fs.readFile(episodesPath(projectRoot), 'utf-8');
+    const lines = raw.split('\n').filter(l => l.trim()).slice(-limit);
+    return lines.map(l => { try { return JSON.parse(l) as Episode; } catch { return null; } }).filter(Boolean) as Episode[];
+  } catch {
+    return [];
+  }
+}
+
+/**
+ * Rank episodes against a query by lexical similarity (prompt + summary). PURE.
+ * Excludes near-identical re-runs of the exact same prompt (score ≥ 0.98) so "similar past
+ * work" doesn't just echo the current task back. Returns top-K with score ≥ minScore.
+ */
+export function rankEpisodes(query: string, episodes: Episode[], opts: { topK?: number; minScore?: number } = {}): EpisodeMatch[] {
+  const topK = opts.topK ?? 2;
+  const minScore = opts.minScore ?? 0.18;
+  const qv = termFreq(tokenize(query));
+  if (qv.size === 0) return [];
+  const scored: EpisodeMatch[] = [];
+  for (const e of episodes) {
+    // Re-run gate compares PROMPTS only: the summary's extra terms dilute the combined
+    // vector, so an identical prompt would otherwise score well under 0.98 and be echoed back.
+    if (cosineSim(qv, termFreq(tokenize(e.prompt))) >= 0.98) continue;
+    const score = cosineSim(qv, termFreq(tokenize(`${e.prompt} ${e.summary}`)));
+    if (score >= minScore && score < 0.98) scored.push({ ...e, score });
+  }
+  return scored.sort((a, b) => b.score - a.score).slice(0, topK);
+}
+
+/** Build the concise system-prompt block from matches. Empty when none. */
+export function buildEpisodeBlock(matches: EpisodeMatch[]): string {
+  if (matches.length === 0) return '';
+  const lines = ['# Similar past work (your own, on this project)', '',
+    'You have done comparable tasks here before. Reuse what worked — don\'t rediscover it:', ''];
+  for (const m of matches) {
+    const files = m.filesChanged.slice(0, 4).join(', ');
+    lines.push(`- **"${m.prompt.replace(/\s+/g, ' ').trim().slice(0, 80)}"** → ${m.summary.replace(/\s+/g, ' ').trim().slice(0, 160)}${files ? ` (touched: ${files})` : ''}`);
+  }
+  return lines.join('\n');
+}
+
+/** Convenience: read → rank → build the injectable block for a query. Used by the loop. */
+export async function loadEpisodeBlock(projectRoot: string, query: string, opts: { topK?: number; minScore?: number } = {}): Promise<string> {
+  return buildEpisodeBlock(rankEpisodes(query, await readEpisodes(projectRoot), opts));
+}
diff --git a/test/episodic-memory.test.ts b/test/episodic-memory.test.ts
new file mode 100644
index 0000000..1017c4c
--- /dev/null
+++ b/test/episodic-memory.test.ts
@@ -0,0 +1,46 @@
+import { describe, it, expect } from 'vitest';
+import { rankEpisodes, buildEpisodeBlock, type Episode } from '../src/context/episodic-memory.js';
+
+const ep = (prompt: string, summary: string, files: string[] = []): Episode =>
+  ({ ts: '2026-06-25T00:00:00Z', prompt, summary, filesChanged: files, toolsUsed: [] });
+
+const CORPUS: Episode[] = [
+  ep('Add cursor pagination to the users REST endpoint', 'Parsed limit+cursor, returned next-cursor', ['src/users.ts']),
+  ep('Configure nightly Postgres backup to S3', 'pg_dump + gzip + aws s3 cp in a cron job', ['scripts/backup.sh']),
+  ep('Add a dark mode toggle to the navbar', 'CSS variables + a useTheme hook', ['src/Navbar.tsx']),
+];
+
+describe('rankEpisodes — retrieve the most SIMILAR past task', () => {
+  it('finds the pagination episode for a pagination-shaped query', () => {
+    const m = rankEpisodes('add pagination with a cursor to the products endpoint', CORPUS, { topK: 1, minScore: 0.1 });
+    expect(m).toHaveLength(1);
+    expect(m[0]!.prompt).toMatch(/cursor pagination/);
+  });
+  it('an UNRELATED query retrieves nothing (smart, not always-on)', () => {
+    expect(rankEpisodes('upgrade the kubernetes ingress controller', CORPUS, { minScore: 0.18 })).toHaveLength(0);
+  });
+  it('excludes a near-identical re-run of the exact same task (score ~1)', () => {
+    const m = rankEpisodes('Add cursor pagination to the users REST endpoint', CORPUS, { topK: 3, minScore: 0.1 });
+    expect(m.every(x => x.score < 0.98)).toBe(true);
+    expect(m.some(x => x.prompt === CORPUS[0]!.prompt)).toBe(false);
+  });
+  it('respects topK and sorts by score', () => {
+    const m = rankEpisodes('add backup and pagination to the database endpoint', CORPUS, { topK: 2, minScore: 0.05 });
+    expect(m.length).toBeLessThanOrEqual(2);
+    for (let i = 1; i < m.length; i++) expect(m[i - 1]!.score).toBeGreaterThanOrEqual(m[i]!.score);
+  });
+  it('empty query / empty corpus → no matches', () => {
+    expect(rankEpisodes('', CORPUS)).toHaveLength(0);
+    expect(rankEpisodes('anything', [])).toHaveLength(0);
+  });
+});
+
+describe('buildEpisodeBlock — concise, bounded', () => {
+  it('renders matches with prompt + summary + files; empty when none', () => {
+    expect(buildEpisodeBlock([])).toBe('');
+    const block = buildEpisodeBlock(rankEpisodes('add pagination cursor endpoint', CORPUS, { topK: 1, minScore: 0.1 }));
+    expect(block).toContain('# Similar past work');
+    expect(block).toContain('cursor');
+    expect(block).toContain('src/users.ts');
+  });
+});