Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .claude-plugin/plugin.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"$schema": "https://json.schemastore.org/claude-code-plugin-manifest.json",
"name": "meridian",
"description": "Research-first workflows, ruthless code review, orchestrator-led reasoning, and opaque subagent isolation for the entire development lifecycle.",
"version": "0.10.9",
"version": "0.11.0",
"author": {
"name": "KodingDev"
},
Expand Down
2 changes: 1 addition & 1 deletion .cursor-plugin/plugin.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"$schema": "https://json.schemastore.org/claude-code-plugin-manifest.json",
"name": "meridian",
"description": "Research-first workflows, ruthless code review, orchestrator-led reasoning, and opaque subagent isolation for the entire development lifecycle.",
"version": "0.10.8",
"version": "0.11.0",
"author": {
"name": "KodingDev"
},
Expand Down
24 changes: 24 additions & 0 deletions .github/scripts/eval-summary.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env node
// Render a promptfoo result JSON as a Markdown table for the GitHub Actions job
// summary (promptfoo has no markdown output format). Reads the path in argv[2],
// writes the table to stdout.
import { readFileSync } from "node:fs";

/** Longest prompt excerpt (in code points) shown in the table. */
const MAX_PROMPT_CHARS = 80;

/** Column header and alignment rows of the summary table. */
const TABLE_HEAD = ["| | Prompt | Routed to | Expected |", "|---|---|---|---|"];

/** Collapse every whitespace run (including newlines) so the text stays on one table row. */
const collapseWhitespace = (text) => String(text).replace(/\s+/g, " ").trim();

/** Escape pipes so the text cannot terminate a Markdown table cell. */
const escapePipes = (text) => text.replace(/\|/g, "\\|");

/**
 * Cut `text` to at most `limit` code points, appending an ellipsis when cut.
 * Works on code points so a surrogate pair is never split.
 */
function truncate(text, limit) {
  const chars = Array.from(text);
  return chars.length > limit ? `${chars.slice(0, limit).join("")}…` : text;
}

/** Make a value safe to wrap in backticks inside a table cell. */
function toCodeCell(value) {
  // A backtick would close the code span early, so swap it for an apostrophe.
  return escapePipes(collapseWhitespace(value)).replace(/`/g, "'");
}

/**
 * Locate the per-test rows in a promptfoo result document.
 * Accepts both the nested (`results.results`) and the flat (`results`) layout
 * and falls back to an empty list when neither holds an array.
 */
function extractRows(result) {
  const candidates = [result?.results?.results, result?.results];
  return candidates.find((candidate) => Array.isArray(candidate)) ?? [];
}

/**
 * Render a parsed promptfoo result as a Markdown heading plus pass/fail table.
 *
 * @param {unknown} result - parsed promptfoo result JSON
 * @returns {string} Markdown without a trailing newline
 */
function renderSummary(result) {
  const rows = extractRows(result);
  const lines = [...TABLE_HEAD];
  let passed = 0;

  for (const row of rows) {
    // Truncate BEFORE escaping so the cut can never land inside a "\|" escape.
    const prompt = escapePipes(
      truncate(collapseWhitespace(row?.vars?.prompt ?? ""), MAX_PROMPT_CHARS),
    );

    const calls = row?.response?.metadata?.skillCalls;
    const names = Array.isArray(calls) ? calls.map((call) => call?.name).filter(Boolean) : [];
    const got = names.join(", ") || "(none)";

    // Only the first assertion is inspected; a non-"skill-used" assertion
    // (e.g. the javascript emptiness check) means no skill is expected.
    const assertion = row?.testCase?.assert?.[0];
    let want = "?";
    if (assertion) {
      want = assertion.type === "skill-used" ? String(assertion.value ?? "?") : "(none)";
    }

    if (row?.success) passed++;
    lines.push(
      `| ${row?.success ? "✅" : "❌"} | ${prompt} | \`${toCodeCell(got)}\` | \`${toCodeCell(want)}\` |`,
    );
  }

  return `## Routing eval — ${passed}/${rows.length} passed\n\n${lines.join("\n")}`;
}

// Entry point: read the path in argv[2] and write the table to stdout.
const resultPath = process.argv[2];
if (resultPath === undefined) {
  console.error("usage: eval-summary.mjs <promptfoo-result.json>");
  process.exitCode = 1;
} else {
  let result;
  try {
    result = JSON.parse(readFileSync(resultPath, "utf8"));
  } catch (err) {
    // stdout is what lands in the job summary, so explain the gap there
    // instead of dying with a stack trace and leaving the summary empty.
    console.log(`## Routing eval — no results\n\nCould not read \`${resultPath}\`: ${err.message}`);
    process.exitCode = 1;
  }
  if (result !== undefined) console.log(renderSummary(result));
}
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,4 @@ jobs:
with:
node-version: 22
- name: Test hooks
run: node --test test/meridian-hooks.test.mjs test/meridian-lib.test.mjs
run: node --test "test/*.test.mjs"
39 changes: 39 additions & 0 deletions .github/workflows/eval.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
name: Routing eval

# On-demand only — never on push/PR. Needs the ANTHROPIC_API_KEY secret and makes
# paid API calls, so it is not a merge gate.
on:
workflow_dispatch:

permissions:
contents: read

jobs:
eval:
name: Skill-routing eval
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: pnpm/action-setup@v4
- uses: actions/setup-node@v4
with:
node-version: 22
cache: pnpm
- name: Install dev tooling
run: pnpm install --frozen-lockfile
- name: Run routing eval
id: eval
continue-on-error: true
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
run: pnpm exec promptfoo eval -c promptfooconfig.yaml --output eval-report.html eval-result.json
- name: Write the matrix to the job summary
if: always()
run: node .github/scripts/eval-summary.mjs eval-result.json >> "$GITHUB_STEP_SUMMARY"
- name: Upload the HTML report
if: always()
uses: actions/upload-artifact@v4
with:
name: routing-eval-report
path: eval-report.html
if-no-files-found: warn
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@
.DS_Store
skills-workspace/
node_modules/
.env
.promptfoo/
1 change: 1 addition & 0 deletions .plugin/plugin.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{
"name": "meridian",
"version": "0.11.0",
"hooks": "./hooks/hooks-copilot.json"
}
42 changes: 42 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Changelog

All notable changes to Meridian are recorded here. The format follows
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project adheres
to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). History before
0.11.0 lives in the git log.

## [0.11.0] - 2026-06-25

### Added

- A `PreToolUse` guard (Claude) that blocks `git` commits carrying AI attribution
(`Co-Authored-By: Claude`, "Generated with Claude", `claude.ai/code`, or a
`Claude-Session` trailer) and prevents staging the gitignored `.meridian/`
working artifacts, turning two output-style principles into enforced gates.
- Consistency guards in the test suite: the per-host manifest versions must agree,
every `meridian:<skill>`/`meridian:<agent>` reference must resolve to something
that exists, and each skill's frontmatter name must match its directory.
- A skill-routing eval harness (`eval/`, promptfoo + the `anthropic:claude-agent-sdk`
provider) that checks prompts route to the correct skill against the real plugin on
Sonnet. On-demand dev tooling (`pnpm eval`); not part of the offline CI gates.

### Changed

- The Craft & Simplicity review lens judges comments by value rather than count,
explicitly flagging chain-of-thought narrated as comments, self-evident
restatement, and oversized comment blocks.
- The README lists the `sketch` workflow and points to the composing `meridian`,
`triangulate`, and `auto` skills.
- The test runner discovers `test/*.test.mjs` by glob, so new suites need no
package.json or CI edit.

### Fixed

- Aligned the Claude, Cursor, and Copilot manifest versions, which had drifted to
0.10.9 and 0.10.8 because manifest validation only inspects the Claude manifest.
- Hooks exit cleanly when no plugin-root environment variable is set, wrap their
filesystem calls so an I/O error degrades to a no-op, and still match a failure
signal typed with an accidental double-space.
- Post-compaction orientation re-injection is regression-tested, keeping the
routing table alive when context compaction drops it.
- Removed stray template markup from the entry-point routing skill.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Hard-won knowledge dies with the session. You debug something for two hours, nai
|-------|------|
| `research` | Verify APIs/libs against live docs before implementing |
| `brainstorm` | Design exploration -> spec through conversation |
| `sketch` | Lightweight spec for a small, well-scoped fix |
| `execute` | Implement from spec with verification gates |
| `delegate` | Dispatch subagents with clean context isolation |
| `debug` | Root-cause investigation, no fixes without understanding |
Expand All @@ -32,6 +33,8 @@ Hard-won knowledge dies with the session. You debug something for two hours, nai
| `commit` | Clean git commits, no AI attribution |
| `document` | Human-readable docs from resolved work |

Three more compose with these rather than standing alone: `meridian` (the routing reference, for when it's unclear which skill fits), `triangulate` (a verification lens that grounds specific-value claims against their source), and `auto` (a modifier that runs any task autonomously when you step away).

## What gets installed

- A `Meridian` output style applied automatically while the plugin is enabled (overrides any `/output-style` selection while loaded). It carries the durable principles — three pillars, voice, commit-attribution override, the challenge protocol — directly in the system prompt rather than relying on per-turn reminders.
Expand Down
65 changes: 65 additions & 0 deletions eval/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Skill-routing eval

Verifies that representative prompts route to the correct Meridian skill (or none),
against the real plugin, on **Sonnet** (`claude-sonnet-4-6`) — the routing baseline:
if Sonnet can't route a prompt, the design is moot. This is **on-demand dev tooling,
not a CI gate** — it makes live, paid API calls (a full run is a few turns of the
agent per scenario).

## Prerequisites

- Dev dependencies installed: `pnpm install`.
- An Anthropic API key in a gitignored `.env` at the repo root:

```
ANTHROPIC_API_KEY=sk-ant-...
```

promptfoo loads `.env` automatically. `.env` is gitignored — never commit it.
Alternatively, set `apiKeyRequired: false` in `promptfooconfig.yaml` to run against
a local Claude Code login.

## Run

```
pnpm eval # run the corpus on Sonnet
pnpm eval:view # open the pass/fail matrix
```

## What it checks

- `scenarios/positive.yaml` — one prompt per routable skill; asserts
`skill-used: meridian:<skill>`.
- `scenarios/negative.yaml` — trivial prompts; asserts no skill fired.

Self-contained prompts (where the intent is fully in the message) route reliably. A
scenario whose correct route depends on context the prompt alone doesn't carry — a
prior failed fix for a reroute, an existing spec for `execute` — is not a meaningful
single-turn test; see "Known gap" below.

## Adding a scenario

Append to the matching file:

```yaml
- vars:
prompt: "<the user message>"
assert:
- type: skill-used
value: meridian:<skill> # or a javascript skillCalls.length===0 check for "none"
description: "<why this is the correct route>"
```

Every expected route must trace to a documented routing rule (the orientation table
or a skill description), not intuition. When a real misroute surfaces, add it here.

## Known gap

The **failure-signal reroute** (a terse "still broken" routing to `debug`) is only
meaningful _mid-flow_, after an actual failed fix — a cold first-message "still broken"
has no prior failure to debug, so the model rightly declines. That needs prior
conversation turns, which the `anthropic:claude-agent-sdk` provider models via session
`resume`/`continue` rather than a declarative fixture, so it is deferred. The hook's
_firing_ is already covered deterministically by `test/meridian-lib.test.mjs`
(`isFailureSignal`) and `test/meridian-hooks.test.mjs`; what's deferred is the
model-level test of whether the model obeys the injected reroute.
19 changes: 19 additions & 0 deletions eval/scenarios/negative.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Prompts that should trigger NO skill — the orientation "just do it" row.
# skill-used has no negation form; assert skillCalls is empty via javascript.
- vars:
prompt: "What does the parseConfig function do?"
assert:
- type: javascript
value: |
const skillCalls = context.providerResponse?.metadata?.skillCalls || [];
return skillCalls.length === 0;
description: "trivial question → just answer, no skill"

- vars:
prompt: "Rename the getUser function to fetchUser everywhere it's used."
assert:
- type: javascript
value: |
const skillCalls = context.providerResponse?.metadata?.skillCalls || [];
return skillCalls.length === 0;
description: "trivial rename → just do it (orientation 'rename X to Y'), not sketch"
71 changes: 71 additions & 0 deletions eval/scenarios/positive.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# One representative prompt per routable skill in the orientation table.
# Each expected value traces to that skill's description / orientation example.
- vars:
prompt: "Fix the hover state on the navbar links — they don't change colour."
assert:
- type: skill-used
value: meridian:sketch
description: "small, well-scoped UI fix → sketch"

- vars:
prompt: "I want to build a notification system with email and in-app delivery."
assert:
- type: skill-used
value: meridian:brainstorm
description: "new multi-subsystem feature → brainstorm"

- vars:
prompt: "I'm getting 'TypeError: cannot read properties of undefined (reading id)' when the dashboard loads."
assert:
- type: skill-used
value: meridian:debug
description: "bug with a stack trace → debug"

- vars:
prompt: "I'm about to write the payment retry logic against the Stripe PaymentIntents API — verify how its idempotency keys actually behave before I code against them."
assert:
- type: skill-used
value: meridian:research
description: "about to write code against an external API → research (verify before building)"

- vars:
prompt: "I just finished the auth refactor — review it before I merge."
assert:
- type: skill-used
value: meridian:review
description: "post-implementation quality check → review"

- vars:
prompt: "Here's the reviewer's PR feedback to work through: extract the validation helper, and the retry loop has an off-by-one."
assert:
- type: skill-used
value: meridian:respond
description: "triaging reviewer feedback → respond"

- vars:
prompt: "Commit this."
assert:
- type: skill-used
value: meridian:commit
description: "explicit commit request → commit"

- vars:
prompt: "Write up what we learned debugging that race condition so the next session doesn't repeat it."
assert:
- type: skill-used
value: meridian:document
description: "capture hard-won knowledge → document"

- vars:
prompt: "Requirements are locked and the design is approved — no planning needed, go implement the token-bucket rate limiter (100 req/min) as Express middleware now."
assert:
- type: skill-used
value: meridian:execute
description: "clear approved requirements, ready to build → execute"

- vars:
prompt: "Do these three independent jobs in parallel: bump the lint config, split the utils file, and regenerate the API types."
assert:
- type: skill-used
value: meridian:delegate
description: "2+ independent tasks at once → delegate"
18 changes: 15 additions & 3 deletions hooks/hooks.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"hooks": [
{
"type": "command",
"command": "node -e \"let r=process.env.CLAUDE_PLUGIN_ROOT||process.env.PLUGIN_ROOT,cp=require('child_process');if(process.platform==='linux'&&/^[a-zA-Z]:/.test(r)){try{r=cp.execFileSync('wslpath',['-u',r],{encoding:'utf8'}).trim()}catch{r='/mnt/'+r[0].toLowerCase()+r.slice(2).split(String.fromCharCode(92)).join('/')}}cp.execFileSync(process.execPath,[r+'/hooks/session-start.mjs'],{stdio:'inherit'})\"",
"command": "node -e \"let r=process.env.CLAUDE_PLUGIN_ROOT||process.env.PLUGIN_ROOT;if(!r)process.exit(0);let cp=require('child_process');if(process.platform==='linux'&&/^[a-zA-Z]:/.test(r)){try{r=cp.execFileSync('wslpath',['-u',r],{encoding:'utf8'}).trim()}catch{r='/mnt/'+r[0].toLowerCase()+r.slice(2).split(String.fromCharCode(92)).join('/')}}cp.execFileSync(process.execPath,[r+'/hooks/session-start.mjs'],{stdio:'inherit'})\"",
"async": false
}
]
Expand All @@ -18,7 +18,7 @@
"hooks": [
{
"type": "command",
"command": "node -e \"let r=process.env.CLAUDE_PLUGIN_ROOT||process.env.PLUGIN_ROOT,cp=require('child_process');if(process.platform==='linux'&&/^[a-zA-Z]:/.test(r)){try{r=cp.execFileSync('wslpath',['-u',r],{encoding:'utf8'}).trim()}catch{r='/mnt/'+r[0].toLowerCase()+r.slice(2).split(String.fromCharCode(92)).join('/')}}cp.execFileSync(process.execPath,[r+'/hooks/user-prompt-submit.mjs'],{stdio:'inherit'})\"",
"command": "node -e \"let r=process.env.CLAUDE_PLUGIN_ROOT||process.env.PLUGIN_ROOT;if(!r)process.exit(0);let cp=require('child_process');if(process.platform==='linux'&&/^[a-zA-Z]:/.test(r)){try{r=cp.execFileSync('wslpath',['-u',r],{encoding:'utf8'}).trim()}catch{r='/mnt/'+r[0].toLowerCase()+r.slice(2).split(String.fromCharCode(92)).join('/')}}cp.execFileSync(process.execPath,[r+'/hooks/user-prompt-submit.mjs'],{stdio:'inherit'})\"",
"async": false
}
]
Expand All @@ -30,7 +30,19 @@
"hooks": [
{
"type": "command",
"command": "node -e \"let r=process.env.CLAUDE_PLUGIN_ROOT||process.env.PLUGIN_ROOT,cp=require('child_process');if(process.platform==='linux'&&/^[a-zA-Z]:/.test(r)){try{r=cp.execFileSync('wslpath',['-u',r],{encoding:'utf8'}).trim()}catch{r='/mnt/'+r[0].toLowerCase()+r.slice(2).split(String.fromCharCode(92)).join('/')}}cp.execFileSync(process.execPath,[r+'/hooks/session-end.mjs'],{stdio:'inherit'})\"",
"command": "node -e \"let r=process.env.CLAUDE_PLUGIN_ROOT||process.env.PLUGIN_ROOT;if(!r)process.exit(0);let cp=require('child_process');if(process.platform==='linux'&&/^[a-zA-Z]:/.test(r)){try{r=cp.execFileSync('wslpath',['-u',r],{encoding:'utf8'}).trim()}catch{r='/mnt/'+r[0].toLowerCase()+r.slice(2).split(String.fromCharCode(92)).join('/')}}cp.execFileSync(process.execPath,[r+'/hooks/session-end.mjs'],{stdio:'inherit'})\"",
"async": false
}
]
}
],
"PreToolUse": [
{
"matcher": "Bash",
"hooks": [
{
"type": "command",
"command": "node -e \"let r=process.env.CLAUDE_PLUGIN_ROOT||process.env.PLUGIN_ROOT;if(!r)process.exit(0);let cp=require('child_process');if(process.platform==='linux'&&/^[a-zA-Z]:/.test(r)){try{r=cp.execFileSync('wslpath',['-u',r],{encoding:'utf8'}).trim()}catch{r='/mnt/'+r[0].toLowerCase()+r.slice(2).split(String.fromCharCode(92)).join('/')}}cp.execFileSync(process.execPath,[r+'/hooks/pre-tool-use.mjs'],{stdio:'inherit'})\"",
"async": false
}
]
Expand Down
6 changes: 5 additions & 1 deletion hooks/lib/signals.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,11 @@ export function isFailureSignal(prompt) {
if (typeof prompt !== "string") return false;
const trimmed = prompt.trim();
if (!trimmed || trimmed.length > MAX_SIGNAL_LENGTH) return false;
const normalized = trimmed.toLowerCase().replace(/[\s.!?:;,~]+$/, "");
// Collapse internal whitespace so an accidental double-space ("still  broken") still matches.
const normalized = trimmed
.toLowerCase()
.replace(/[\s.!?:;,~]+$/, "")
.replace(/\s+/g, " ");
return FAILURE_SIGNAL.test(normalized);
}

Expand Down
10 changes: 7 additions & 3 deletions hooks/lib/state.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ export function sessionDir(host, id) {
*/
export function touch(host, id) {
const dir = sessionDir(host, id);
mkdirSync(dir, { recursive: true });
try {
mkdirSync(dir, { recursive: true });
utimesSync(dir, new Date(), new Date());
} catch {
// best-effort
// best-effort: a failed touch only risks an early prune of an idle session
}
}

Expand Down Expand Up @@ -107,5 +107,9 @@ export function pruneStale(host, currentId, maxAgeMs) {
* @param {string} id
*/
export function clear(host, id) {
rmSync(sessionDir(host, id), { recursive: true, force: true });
try {
rmSync(sessionDir(host, id), { recursive: true, force: true });
} catch {
// best-effort: cleanup only; a left-behind dir is pruned later by age
}
}
Loading
Loading