From 62176c25f390e071e80aea7c0776dca64556c92d Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Fri, 19 Jun 2026 13:20:04 +0200
Subject: [PATCH] fix(evals): require self-eval guidance reads

---
Note: this patch text was rebuilt from a copy whose line breaks and
indentation had been collapsed. The YAML context lines and the blob id of
the new file are best-effort reconstructions; regenerate the patch with
`git format-patch` from a real checkout before applying it.

 evals/agentv-self/agentv-self.eval.yaml       |  63 +++++--
 .../graders/required-file-reads.ts            | 218 ++++++++++++++++++
 2 files changed, 261 insertions(+), 20 deletions(-)
 create mode 100644 evals/agentv-self/graders/required-file-reads.ts

diff --git a/evals/agentv-self/agentv-self.eval.yaml b/evals/agentv-self/agentv-self.eval.yaml
index 8125eea4..33ab95b6 100644
--- a/evals/agentv-self/agentv-self.eval.yaml
+++ b/evals/agentv-self/agentv-self.eval.yaml
@@ -1,8 +1,8 @@
 name: agentv-self-guidance
 description: >-
   Validates the current AgentV repo guidance split, repo-change preflight,
-  and wire-format naming rules using the live AGENTS index and linked
-  `.agents/*.md` files copied into the eval workspace.
+  and wire-format naming rules by requiring the agent to inspect the live
+  AGENTS index and linked `.agents/*.md` files from the prepared eval workspace.
 
 tags: [agent, guidance]
 
@@ -16,29 +16,32 @@ workspace:
 input:
   - role: user
     content:
-      - type: file
-        value: /AGENTS.md
-      - type: file
-        value: /.agents/product-boundary.md
-      - type: file
-        value: /.agents/workflow.md
-      - type: file
-        value: /.agents/verification.md
-      - type: file
-        value: /.agents/conventions.md
       - type: text
         value: |
-          The files above are the current AgentV repo guidance. Use them as the
-          source of truth for the next request.
+          The eval harness has prepared the current AgentV repo in your working
+          directory. Follow the repo-facing agent instructions from disk. Start
+          with AGENTS.md and read any linked guidance needed for the request.
 
 tests:
   - id: guidance-split-paths
-    criteria: Agent points detailed workflow, verification, and conventions questions at the linked `.agents` docs.
+    criteria: Agent points detailed workflow, verification, and conventions questions at the linked `.agents` docs after reading the relevant guidance files.
     input: |
       List the three relative file paths that contain the detailed rules for
-      workflow, verification, and conventions.
-      Reply with one path per line and nothing else.
+      workflow, verification, and conventions. Include the top-level title from
+      each file after the path.
+      Reply with one path and title per line and nothing else.
     assertions:
+      - name: required-guidance-reads
+        type: code-grader
+        command: ["bun", "run", "./graders/required-file-reads.ts"]
+        cwd: .
+        required: true
+        config:
+          required_files:
+            - AGENTS.md
+            - .agents/workflow.md
+            - .agents/verification.md
+            - .agents/conventions.md
       - type: contains
         value: .agents/workflow.md
       - type: contains
@@ -51,20 +54,40 @@ tests:
           - Uses relative repo paths, not prose summaries
 
   - id: repo-change-preflight
-    criteria: Agent names the required git commands that must start every repo change.
+    criteria: Agent names the required git commands that must start every repo change after reading the repo workflow guidance.
     input: |
       What two exact shell commands must start every repo change in this repository?
       Reply with one command per line and nothing else.
     assertions:
+      - name: required-guidance-reads
+        type: code-grader
+        command: ["bun", "run", "./graders/required-file-reads.ts"]
+        cwd: .
+        required: true
+        config:
+          required_files:
+            - AGENTS.md
+            - .agents/workflow.md
       - type: contains
         value: git fetch origin
       - type: contains
         value: git status --short --branch
 
   - id: wire-format-snake-case
-    criteria: Agent keeps wire-format keys in snake_case.
+    criteria: Agent keeps wire-format keys in snake_case after reading the repo conventions guidance.
     input: |
       What key naming convention should anything that crosses a process boundary
       use in this repository?
       Reply with just the convention name.
-    expected_output: snake_case
+    assertions:
+      - name: required-guidance-reads
+        type: code-grader
+        command: ["bun", "run", "./graders/required-file-reads.ts"]
+        cwd: .
+        required: true
+        config:
+          required_files:
+            - AGENTS.md
+            - .agents/conventions.md
+      - type: equals
+        value: snake_case
diff --git a/evals/agentv-self/graders/required-file-reads.ts b/evals/agentv-self/graders/required-file-reads.ts
new file mode 100644
index 00000000..e1249d83
--- /dev/null
+++ b/evals/agentv-self/graders/required-file-reads.ts
@@ -0,0 +1,218 @@
+#!/usr/bin/env bun
+/**
+ * Verifies that an agent inspected required repo guidance files from the
+ * prepared workspace. This is intentionally suite-local: the general primitive
+ * is code-grader, while this file encodes AgentV self-eval expectations.
+ */
+import { type Message, type ToolCall, type TraceEvent, defineCodeGrader } from '@agentv/sdk';
+
+type Assertion = { text: string; passed: boolean; evidence?: string };
+
+/** Keeps the string entries of an array; any non-array value yields []. */
+function stringArray(value: unknown): string[] {
+  return Array.isArray(value)
+    ? value.filter((item): item is string => typeof item === 'string')
+    : [];
+}
+
+/**
+ * Canonicalizes a path for comparison: drops a file:// scheme, turns
+ * backslashes into slashes, strips one wrapping quote per side, collapses
+ * repeated slashes, and removes a leading ./ or leading slashes.
+ */
+function normalizePath(value: string): string {
+  return value
+    .replace(/^file:\/\//, '')
+    .replace(/\\/g, '/')
+    .replace(/^['"]|['"]$/g, '')
+    .replace(/\/+/g, '/')
+    .replace(/^\.\//, '')
+    .replace(/^\/+/, '');
+}
+
+/** True when candidate equals expected, or ends with /expected, after normalization. */
+function pathMatches(candidate: string, expected: string): boolean {
+  const normalizedCandidate = normalizePath(candidate);
+  const normalizedExpected = normalizePath(expected);
+  return (
+    normalizedCandidate === normalizedExpected ||
+    normalizedCandidate.endsWith(`/${normalizedExpected}`)
+  );
+}
+
+/**
+ * True when one shell word of the command names the expected file. Matching is
+ * per word rather than by substring, so MY_AGENTS.md or AGENTS.md.bak do not
+ * satisfy a requirement for AGENTS.md.
+ */
+function commandNamesPath(command: string, expected: string): boolean {
+  return command
+    .split(/[\s;|&<>()`]+/)
+    .some((word) => word.length > 0 && pathMatches(word, expected));
+}
+
+/** Collects every text line in a tool output, descending into arrays and objects. */
+function outputLines(value: unknown): string[] {
+  if (typeof value === 'string') return value.split(/\r?\n/);
+  if (Array.isArray(value)) return value.flatMap(outputLines);
+  if (value && typeof value === 'object') return Object.values(value).flatMap(outputLines);
+  return [];
+}
+
+/**
+ * True when the output attributes content to the expected file: a line that
+ * starts with "path:" from a grep/rg run, or a "==> path <==" banner from
+ * head/tail. A path that only appears inside printed file content does not
+ * count: the AGENTS index links to the other guidance files, so printing it
+ * would otherwise mark every linked file as read.
+ */
+function outputAttributesPath(command: string, output: unknown, expected: string): boolean {
+  const grepLike = /\b(rg|grep)\b/.test(command);
+  const bannerLike = /\b(head|tail)\b/.test(command);
+  if (!grepLike && !bannerLike) return false;
+
+  return outputLines(output).some((line) => {
+    const banner = bannerLike ? /^==> (.+) <==$/.exec(line.trim()) : null;
+    if (banner?.[1]) return pathMatches(banner[1], expected);
+    const prefix = grepLike ? /^([^\s:]+):/.exec(line) : null;
+    return prefix?.[1] ? pathMatches(prefix[1], expected) : false;
+  });
+}
+
+/** Returns the value when it is a plain (non-array) object, otherwise an empty record. */
+function asRecord(value: unknown): Record<string, unknown> {
+  return value && typeof value === 'object' && !Array.isArray(value)
+    ? (value as Record<string, unknown>)
+    : {};
+}
+
+/** Flattens the tool calls attached to each message. */
+function toolCallsFromMessages(messages: readonly Message[] | undefined): ToolCall[] {
+  return (messages ?? []).flatMap((message) => [...(message.toolCalls ?? [])]);
+}
+
+/** Converts tool_call trace events that carry a tool payload into ToolCall records. */
+function toolCallsFromTrace(events: readonly TraceEvent[] | undefined): ToolCall[] {
+  return (events ?? [])
+    .filter((event) => event.type === 'tool_call' && event.tool)
+    .map((event) => ({
+      tool: event.tool?.name ?? '',
+      input: event.tool?.input,
+      output: event.tool?.output,
+      id: event.tool?.callId,
+      durationMs: event.durationMs,
+    }));
+}
+
+/** Merges calls from messages and trace, dropping exact duplicates. */
+function allToolCalls(
+  messages: readonly Message[],
+  trace: { events?: readonly TraceEvent[] } | null | undefined,
+): ToolCall[] {
+  const seen = new Set<string>();
+  const calls = [...toolCallsFromMessages(messages), ...toolCallsFromTrace(trace?.events)];
+
+  return calls.filter((call) => {
+    const key = JSON.stringify([call.tool, call.id, call.input, call.output]);
+    if (seen.has(key)) return false;
+    seen.add(key);
+    return true;
+  });
+}
+
+/** Returns the path argument of a read/read_file/readfile tool call, if it has one. */
+function readPathFromCall(call: ToolCall): string | undefined {
+  const toolName = call.tool.toLowerCase();
+  if (toolName !== 'read' && toolName !== 'read_file' && toolName !== 'readfile') {
+    return undefined;
+  }
+
+  const input = asRecord(call.input);
+  const path = input.file_path ?? input.path ?? input.filePath;
+  return typeof path === 'string' ? path : undefined;
+}
+
+/** Returns the shell command of a call (command, cmd, or shell_command), or '' when absent. */
+function bashCommand(call: ToolCall): string {
+  const input = asRecord(call.input);
+  const command = input.command ?? input.cmd ?? input.shell_command;
+  return typeof command === 'string' ? command : '';
+}
+
+/** Heuristic: does the command invoke a program that prints or searches file contents? */
+function commandReadsFiles(command: string): boolean {
+  if (/\b(cat|sed|awk|head|tail|less|more|bat|nl)\b/.test(command)) {
+    return true;
+  }
+
+  if (/\b(rg|grep)\b/.test(command)) {
+    // --files only lists file names. It is anchored on whitespace because \b
+    // cannot match between a space and a dash, so /\b--files\b/ never fired.
+    return !/(?:^|\s)--files(?=\s|$)/.test(command);
+  }
+
+  return false;
+}
+
+/** Describes how the call read expectedFile, or returns undefined when it did not. */
+function evidenceForRequiredFile(call: ToolCall, expectedFile: string): string | undefined {
+  const directPath = readPathFromCall(call);
+  if (directPath && pathMatches(directPath, expectedFile)) {
+    return `Read tool loaded ${directPath}`;
+  }
+
+  if (call.tool.toLowerCase() !== 'bash') {
+    return undefined;
+  }
+
+  const command = bashCommand(call);
+  if (!command || !commandReadsFiles(command)) {
+    return undefined;
+  }
+
+  if (
+    commandNamesPath(command, expectedFile) ||
+    outputAttributesPath(command, call.output, expectedFile)
+  ) {
+    const compactCommand = command.replace(/\s+/g, ' ').trim();
+    return `Bash command inspected ${expectedFile}: ${compactCommand.slice(0, 160)}`;
+  }
+
+  return undefined;
+}
+
+/** Scores the share of required files with read evidence; an empty config scores 0. */
+export default defineCodeGrader(({ config, messages, trace }) => {
+  const requiredFiles = stringArray(config?.requiredFiles ?? config?.required_files);
+
+  if (requiredFiles.length === 0) {
+    return {
+      score: 0,
+      assertions: [{ text: 'required_files config is missing or empty', passed: false }],
+    };
+  }
+
+  const toolCalls = allToolCalls(messages, trace);
+  const assertions: Assertion[] = requiredFiles.map((file) => {
+    const evidence = toolCalls
+      .map((call) => evidenceForRequiredFile(call, file))
+      .find((item): item is string => typeof item === 'string');
+
+    return evidence
+      ? { text: `Read required file: ${file}`, passed: true, evidence }
+      : {
+          text: `Read required file: ${file}`,
+          passed: false,
+          evidence:
+            toolCalls.length > 0
+              ? `Observed ${toolCalls.length} tool call(s), but none read ${file}`
+              : 'No tool calls recorded',
+        };
+  });
+
+  const passed = assertions.filter((assertion) => assertion.passed).length;
+  return {
+    score: passed / assertions.length,
+    assertions,
+  };
+});