From 62176c25f390e071e80aea7c0776dca64556c92d Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 19 Jun 2026 13:20:04 +0200
Subject: [PATCH] fix(evals): require self-eval guidance reads

---
 evals/agentv-self/agentv-self.eval.yaml       |  63 +++++--
 .../graders/required-file-reads.ts            | 177 ++++++++++++++++++
 2 files changed, 220 insertions(+), 20 deletions(-)
 create mode 100644 evals/agentv-self/graders/required-file-reads.ts

diff --git a/evals/agentv-self/agentv-self.eval.yaml b/evals/agentv-self/agentv-self.eval.yaml
index 8125eea4..33ab95b6 100644
--- a/evals/agentv-self/agentv-self.eval.yaml
+++ b/evals/agentv-self/agentv-self.eval.yaml
@@ -1,8 +1,8 @@
 name: agentv-self-guidance
 description: >-
   Validates the current AgentV repo guidance split, repo-change preflight,
-  and wire-format naming rules using the live AGENTS index and linked
-  `.agents/*.md` files copied into the eval workspace.
+  and wire-format naming rules by requiring the agent to inspect the live
+  AGENTS index and linked `.agents/*.md` files from the prepared eval workspace.
 
 tags: [agent, guidance]
 
@@ -16,29 +16,32 @@ workspace:
 input:
   - role: user
     content:
-      - type: file
-        value: /AGENTS.md
-      - type: file
-        value: /.agents/product-boundary.md
-      - type: file
-        value: /.agents/workflow.md
-      - type: file
-        value: /.agents/verification.md
-      - type: file
-        value: /.agents/conventions.md
       - type: text
         value: |
-          The files above are the current AgentV repo guidance. Use them as the
-          source of truth for the next request.
+          The eval harness has prepared the current AgentV repo in your working
+          directory. Follow the repo-facing agent instructions from disk. Start
+          with AGENTS.md and read any linked guidance needed for the request.
 
 tests:
   - id: guidance-split-paths
-    criteria: Agent points detailed workflow, verification, and conventions questions at the linked `.agents` docs.
+    criteria: Agent points detailed workflow, verification, and conventions questions at the linked `.agents` docs after reading the relevant guidance files.
     input: |
       List the three relative file paths that contain the detailed rules for
-      workflow, verification, and conventions.
-      Reply with one path per line and nothing else.
+      workflow, verification, and conventions. Include the top-level title from
+      each file after the path.
+      Reply with one path and title per line and nothing else.
     assertions:
+      - name: required-guidance-reads
+        type: code-grader
+        command: ["bun", "run", "./graders/required-file-reads.ts"]
+        cwd: .
+        required: true
+        config:
+          required_files:
+            - AGENTS.md
+            - .agents/workflow.md
+            - .agents/verification.md
+            - .agents/conventions.md
       - type: contains
         value: .agents/workflow.md
       - type: contains
@@ -51,20 +54,40 @@ tests:
           - Uses relative repo paths, not prose summaries
 
   - id: repo-change-preflight
-    criteria: Agent names the required git commands that must start every repo change.
+    criteria: Agent names the required git commands that must start every repo change after reading the repo workflow guidance.
     input: |
       What two exact shell commands must start every repo change in this repository?
       Reply with one command per line and nothing else.
     assertions:
+      - name: required-guidance-reads
+        type: code-grader
+        command: ["bun", "run", "./graders/required-file-reads.ts"]
+        cwd: .
+        required: true
+        config:
+          required_files:
+            - AGENTS.md
+            - .agents/workflow.md
       - type: contains
         value: git fetch origin
       - type: contains
         value: git status --short --branch
 
   - id: wire-format-snake-case
-    criteria: Agent keeps wire-format keys in snake_case.
+    criteria: Agent keeps wire-format keys in snake_case after reading the repo conventions guidance.
     input: |
       What key naming convention should anything that crosses a process boundary use
       in this repository?
       Reply with just the convention name.
-    expected_output: snake_case
+    assertions:
+      - name: required-guidance-reads
+        type: code-grader
+        command: ["bun", "run", "./graders/required-file-reads.ts"]
+        cwd: .
+        required: true
+        config:
+          required_files:
+            - AGENTS.md
+            - .agents/conventions.md
+      - type: equals
+        value: snake_case
diff --git a/evals/agentv-self/graders/required-file-reads.ts b/evals/agentv-self/graders/required-file-reads.ts
new file mode 100644
index 00000000..e1249d83
--- /dev/null
+++ b/evals/agentv-self/graders/required-file-reads.ts
@@ -0,0 +1,177 @@
+#!/usr/bin/env bun
+/**
+ * Verifies that an agent inspected required repo guidance files from the
+ * prepared workspace. This is intentionally suite-local: the general primitive
+ * is code-grader, while this file encodes AgentV self-eval expectations.
+ */
+import { type Message, type ToolCall, type TraceEvent, defineCodeGrader } from '@agentv/sdk';
+
+type Assertion = { text: string; passed: boolean; evidence?: string }; // one reported check
+
+/** Keeps only the string items of `value`; anything that is not an array yields []. */
+function stringArray(value: unknown): string[] {
+  if (!Array.isArray(value)) return [];
+  return value.filter((item): item is string => typeof item === 'string');
+}
+
+/** Canonicalizes a path-like string so differently spelled paths can be compared. */
+function normalizePath(value: string): string {
+  let path = value.replace(/^file:\/\//, ''); // URL scheme
+  path = path.replace(/\\/g, '/'); // backslash separators
+  path = path.replace(/^['"]|['"]$/g, ''); // one wrapping quote on either end
+  path = path.replace(/\/+/g, '/'); // runs of slashes
+  path = path.replace(/^\.\//, ''); // leading "./"
+  return path.replace(/^\/+/, ''); // leading "/", so absolute and relative forms align
+}
+
+/** True when the normalized paths are equal or `candidate` ends with `/` + `expected`. */
+function pathMatches(candidate: string, expected: string): boolean {
+  const actual = normalizePath(candidate);
+  const wanted = normalizePath(expected);
+  if (actual === wanted) return true;
+  // A longer path still matches when its trailing segments are the expected path.
+  return actual.endsWith(`/${wanted}`);
+}
+
+/** True when `text` contains `expected` as a whole path rather than part of a longer name. */
+function textMentionsPath(text: string, expected: string): boolean {
+  const wanted = normalizePath(expected).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+  // Mirror pathMatches: the path may follow "/" or a delimiter, but must not be glued to
+  // other name characters, so "AGENTS.md" no longer matches "OLD_AGENTS.md" or "AGENTS.md.bak".
+  return new RegExp(`(?<![\\w.-])${wanted}(?![\\w-]|\\.\\w)`).test(normalizePath(text));
+}
+
+/** Returns `value` typed as a string-keyed record if it is a non-array object, else {}. */
+function asRecord(value: unknown): Record<string, unknown> {
+  const isObject = typeof value === 'object' && value !== null && !Array.isArray(value);
+  return isObject ? (value as Record<string, unknown>) : {};
+}
+
+function toolCallsFromMessages(messages: readonly Message[] | undefined): ToolCall[] {
+  return (messages ?? []).flatMap(({ toolCalls }) => [...(toolCalls ?? [])]); // none → []
+}
+
+/** Converts `tool_call` trace events that carry a `tool` payload into ToolCall records. */
+function toolCallsFromTrace(events: readonly TraceEvent[] | undefined): ToolCall[] {
+  const calls: ToolCall[] = [];
+  for (const event of events ?? []) {
+    const tool = event.type === 'tool_call' ? event.tool : undefined;
+    if (!tool) continue;
+    const { name, input, output, callId: id } = tool;
+    calls.push({ tool: name ?? '', input, output, id, durationMs: event.durationMs });
+  }
+  return calls;
+}
+
+/** Merges tool calls from messages and trace events, keeping the first of any duplicates. */
+function allToolCalls(
+  messages: readonly Message[],
+  trace: { events?: readonly TraceEvent[] } | null | undefined,
+): ToolCall[] {
+  const calls = [...toolCallsFromMessages(messages), ...toolCallsFromTrace(trace?.events)];
+  // Calls with identical tool, id, input and output are treated as one.
+  const unique = new Map<string, ToolCall>();
+  for (const call of calls) {
+    const key = JSON.stringify([call.tool, call.id, call.input, call.output]);
+    if (!unique.has(key)) unique.set(key, call);
+  }
+  return [...unique.values()];
+}
+
+/** Returns a read-tool call's path, or undefined if it is not a read or has no string path. */
+function readPathFromCall(call: ToolCall): string | undefined {
+  // Tool names are compared case-insensitively.
+  if (!['read', 'read_file', 'readfile'].includes(call.tool.toLowerCase())) {
+    return undefined;
+  }
+  const { file_path, path, filePath } = asRecord(call.input);
+  const target = file_path ?? path ?? filePath;
+  return typeof target === 'string' ? target : undefined;
+}
+
+function stringify(value: unknown): string {
+  if (typeof value === 'string') return value; // already text
+  if (value == null) return ''; // null and undefined render as nothing
+  try {
+    return JSON.stringify(value);
+  } catch {
+    return String(value); // JSON.stringify threw; fall back to plain coercion
+  }
+}
+
+function bashCommand(call: ToolCall): string {
+  const { command, cmd, shell_command: shellCommand } = asRecord(call.input);
+  const script = command ?? cmd ?? shellCommand; // first present key wins, even if not a string
+  return typeof script === 'string' ? script : '';
+}
+
+/** Heuristic: true when `command` runs a tool that prints file contents. */
+function commandReadsFiles(command: string): boolean {
+  if (/\b(cat|sed|awk|head|tail|less|more|bat|nl)\b/.test(command)) return true;
+  if (!/\b(rg|grep)\b/.test(command)) return false;
+
+  // A search counts unless it only lists paths (`rg --files`, `--files-with-matches`).
+  // The flag is anchored on start/whitespace: `\b--files` could never match after a
+  // space, since neither the space nor `-` is a word character, so the exclusion
+  // silently never applied.
+  return !/(?:^|\s)--files\b/.test(command);
+}
+
+/**
+ * Describes how `call` read `expectedFile`, or returns undefined when it did not.
+ * Command output counts only when it labels lines with the path (`path:` from
+ * grep/rg, `==> path <==` from head/tail). A bare mention proves nothing: AGENTS.md
+ * links the other guidance files, so `cat AGENTS.md` would otherwise pass them all.
+ */
+function evidenceForRequiredFile(call: ToolCall, expectedFile: string): string | undefined {
+  const directPath = readPathFromCall(call);
+  if (directPath && pathMatches(directPath, expectedFile)) {
+    return `Read tool loaded ${directPath}`;
+  }
+  if (call.tool.toLowerCase() !== 'bash') return undefined;
+
+  const command = bashCommand(call);
+  if (!command || !commandReadsFiles(command)) return undefined;
+  const escaped = normalizePath(expectedFile).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+  const labelsFile = new RegExp(`(?<![\\w.-])${escaped}(?::| <==)`);
+  if (!textMentionsPath(command, expectedFile) && !labelsFile.test(stringify(call.output))) {
+    return undefined;
+  }
+  const compactCommand = command.replace(/\s+/g, ' ').trim();
+  return `Bash command inspected ${expectedFile}: ${compactCommand.slice(0, 160)}`;
+}
+
+/** Scores the run as the share of configured required files that some tool call read. */
+export default defineCodeGrader(({ config, messages, trace }) => {
+  // Both the camelCase and snake_case config keys are accepted.
+  const requiredFiles = stringArray(config?.requiredFiles ?? config?.required_files);
+  if (requiredFiles.length === 0) {
+    const text = 'required_files config is missing or empty';
+    return { score: 0, assertions: [{ text, passed: false }] };
+  }
+
+  const toolCalls = allToolCalls(messages, trace);
+  const noEvidence = (file: string): string =>
+    toolCalls.length > 0
+      ? `Observed ${toolCalls.length} tool call(s), but none read ${file}`
+      : 'No tool calls recorded';
+
+  const assertions: Assertion[] = [];
+  let passed = 0;
+  for (const file of requiredFiles) {
+    const text = `Read required file: ${file}`;
+    let evidence: string | undefined;
+    for (const call of toolCalls) {
+      evidence = evidenceForRequiredFile(call, file);
+      if (typeof evidence === 'string') break; // first matching call wins
+    }
+    if (evidence) {
+      passed += 1;
+      assertions.push({ text, passed: true, evidence });
+    } else {
+      assertions.push({ text, passed: false, evidence: noEvidence(file) });
+    }
+  }
+
+  return { score: passed / assertions.length, assertions };
+});