From 4872b7fb8d35eadb46be280b9cefd85c0fff805d Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 05:15:52 +0200 Subject: [PATCH 01/21] feat(results): add artifact pointer contract --- apps/cli/src/commands/results/manifest.ts | 8 +- .../commands/eval/artifact-writer.test.ts | 60 ++++++++- apps/cli/test/commands/results/shared.test.ts | 49 ++++++++ .../docs/docs/evaluation/running-evals.mdx | 15 ++- .../src/content/docs/docs/tools/results.mdx | 2 +- .../evaluation/result-artifact-contract.ts | 103 +++++++++++++++ .../core/src/evaluation/result-row-schema.ts | 1 + packages/core/src/evaluation/results-repo.ts | 3 +- packages/core/src/evaluation/run-artifacts.ts | 117 +++++++++++++++++- packages/core/src/index.ts | 1 + .../core/test/evaluation/results-repo.test.ts | 7 ++ 11 files changed, 350 insertions(+), 16 deletions(-) create mode 100644 packages/core/src/evaluation/result-artifact-contract.ts diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 65044552e..a5d389307 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -3,6 +3,7 @@ import path from 'node:path'; import { type EvaluationResult, + type ResultArtifactPointersWire, type TraceSummary, type TranscriptJsonLine, buildTraceFromMessages, @@ -43,6 +44,7 @@ export interface ResultManifestRecord { readonly output_path?: string; readonly answer_path?: string; readonly transcript_path?: string; + readonly artifact_pointers?: ResultArtifactPointersWire; readonly response_path?: string; readonly artifact_dir?: string; readonly task_dir?: string; @@ -114,6 +116,10 @@ function readOptionalJson(baseDir: string, relativePath: string | undefined): } } +function resolveTranscriptPath(record: ResultManifestRecord): string | undefined { + return record.transcript_path ?? 
record.artifact_pointers?.transcript?.path; +} + function hydrateInput( baseDir: string, record: ResultManifestRecord, @@ -143,7 +149,7 @@ function hydrateOutput( } function hydrateTrace(baseDir: string, record: ResultManifestRecord): EvaluationResult['trace'] { - const transcriptText = readOptionalText(baseDir, record.transcript_path); + const transcriptText = readOptionalText(baseDir, resolveTranscriptPath(record)); if (transcriptText) { try { return traceFromTranscriptJsonLines(parseJsonlLines(transcriptText)); diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 14ab4dfdd..07867988f 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -1,11 +1,19 @@ import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { createHash } from 'node:crypto'; import { mkdir, readFile, readdir, rm, writeFile } from 'node:fs/promises'; import path from 'node:path'; import { + AGENTV_RESULTS_ARTIFACTS_REF, + CANONICAL_TRACE_ARTIFACT_PATH, + CANONICAL_TRANSCRIPT_ARTIFACT_PATH, + EXECUTION_TRACE_SCHEMA_VERSION, type EvalTest, type EvaluationResult, type GraderResult, + TRACE_JSON_MEDIA_TYPE, + TRANSCRIPT_JSONL_MEDIA_TYPE, + TRANSCRIPT_SCHEMA_VERSION, TraceEnvelopeWireSchema, buildTraceFromMessages, fromTraceEnvelopeWire, @@ -72,6 +80,10 @@ function makeEvaluatorResult(overrides: Partial = {}): GraderResul } as GraderResult; } +function sha256Hex(content: Buffer): string { + return createHash('sha256').update(content).digest('hex'); +} + // --------------------------------------------------------------------------- // Grading artifact // --------------------------------------------------------------------------- @@ -828,9 +840,8 @@ describe('writeArtifactsFromResults', () => { await writeArtifactsFromResults(results, testDir); - const transcriptLines = ( - await readFile(path.join(testDir, 'transcript-case', 'outputs', 
'transcript.jsonl'), 'utf8') - ) + const transcriptPath = path.join(testDir, 'transcript-case', 'outputs', 'transcript.jsonl'); + const transcriptLines = (await readFile(transcriptPath, 'utf8')) .trim() .split('\n') .map((line) => JSON.parse(line)); @@ -911,7 +922,8 @@ describe('writeArtifactsFromResults', () => { expect(transcriptLines[1]).not.toHaveProperty('providerSessionId'); expect(envelope.schema_version).toBe('agentv.trace.v1'); expect(envelope.artifact_id).toMatch(/^execution-trace-/); - expect(envelope.artifacts.trace_path).toBe('outputs/trace.json'); + expect(envelope.artifacts.trace_path).toBe(CANONICAL_TRACE_ARTIFACT_PATH); + expect(envelope.artifacts.transcript_path).toBe(CANONICAL_TRANSCRIPT_ARTIFACT_PATH); expect(envelope.artifacts).not.toHaveProperty('execution_trace_path'); expect(envelope.eval.test_id).toBe('transcript-case'); expect(envelope.trace.spans.map((span) => span.attributes['gen_ai.operation.name'])).toEqual([ @@ -924,7 +936,38 @@ describe('writeArtifactsFromResults', () => { (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), ); expect(indexLine.transcript_path).toBe('transcript-case/outputs/transcript.jsonl'); + expect(indexLine.transcript_path.endsWith(CANONICAL_TRANSCRIPT_ARTIFACT_PATH)).toBe(true); expect(indexLine).not.toHaveProperty('trace_path'); + + const traceContent = await readFile( + path.join(testDir, 'transcript-case', 'outputs', 'trace.json'), + ); + const transcriptContent = await readFile(transcriptPath); + const traceSha = sha256Hex(traceContent); + const transcriptSha = sha256Hex(transcriptContent); + + expect(indexLine.artifact_pointers.trace).toMatchObject({ + ref: AGENTV_RESULTS_ARTIFACTS_REF, + key: 'traces/transcript-case/outputs/trace.json', + object_version: `sha256:${traceSha}`, + path: 'transcript-case/outputs/trace.json', + sha256: traceSha, + size: traceContent.byteLength, + schema_version: EXECUTION_TRACE_SCHEMA_VERSION, + media_type: TRACE_JSON_MEDIA_TYPE, + family: 'traces', + }); + 
expect(indexLine.artifact_pointers.transcript).toMatchObject({ + ref: AGENTV_RESULTS_ARTIFACTS_REF, + key: 'transcripts/transcript-case/outputs/transcript.jsonl', + object_version: `sha256:${transcriptSha}`, + path: 'transcript-case/outputs/transcript.jsonl', + sha256: transcriptSha, + size: transcriptContent.byteLength, + schema_version: TRANSCRIPT_SCHEMA_VERSION, + media_type: TRANSCRIPT_JSONL_MEDIA_TYPE, + family: 'transcripts', + }); }); it('omits per-test transcript links when the execution trace has no transcript rows', async () => { @@ -945,6 +988,15 @@ describe('writeArtifactsFromResults', () => { (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), ); expect(indexLine).not.toHaveProperty('transcript_path'); + expect(indexLine.artifact_pointers.trace).toMatchObject({ + ref: AGENTV_RESULTS_ARTIFACTS_REF, + key: 'traces/no-transcript-case/outputs/trace.json', + path: 'no-transcript-case/outputs/trace.json', + schema_version: EXECUTION_TRACE_SCHEMA_VERSION, + media_type: TRACE_JSON_MEDIA_TYPE, + family: 'traces', + }); + expect(indexLine.artifact_pointers).not.toHaveProperty('transcript'); const envelope = TraceEnvelopeWireSchema.parse( JSON.parse( diff --git a/apps/cli/test/commands/results/shared.test.ts b/apps/cli/test/commands/results/shared.test.ts index 12f64f61e..e9d2f33a0 100644 --- a/apps/cli/test/commands/results/shared.test.ts +++ b/apps/cli/test/commands/results/shared.test.ts @@ -79,6 +79,55 @@ describe('results shared source resolution', () => { expect(results[0].trace.toolCalls).toEqual({ rg: 1 }); }); + it('hydrates transcripts from artifact pointers when transcript_path is absent', () => { + const runDir = path.join(tempDir, '.agentv', 'results', 'runs', '2026-03-25T10-00-00-000Z'); + const transcriptRelativePath = 'pointer-case/outputs/transcript.jsonl'; + mkdirSync(path.join(runDir, 'pointer-case', 'outputs'), { recursive: true }); + writeFileSync( + path.join(runDir, transcriptRelativePath), + `${JSON.stringify({ + 
schema_version: 'agentv.transcript.v1', + test_id: 'pointer-case', + target: 'codex', + message_index: 0, + role: 'assistant', + content: 'Loaded from pointer', + source: { provider: 'codex', session_id: 'session-pointer' }, + })}\n`, + ); + const indexPath = path.join(runDir, 'index.jsonl'); + writeFileSync( + indexPath, + `${JSON.stringify({ + timestamp: '2026-03-25T10:00:00.000Z', + test_id: 'pointer-case', + target: 'codex', + score: 1, + grading_path: 'pointer-case/grading.json', + timing_path: 'pointer-case/timing.json', + artifact_pointers: { + transcript: { + ref: 'agentv/results/v1/artifacts', + key: 'transcripts/pointer-case/outputs/transcript.jsonl', + object_version: 'sha256:test', + path: transcriptRelativePath, + sha256: 'test', + size: 1, + schema_version: 'agentv.transcript.v1', + media_type: 'application/x-ndjson', + family: 'transcripts', + }, + }, + })}\n`, + ); + + const results = loadManifestResults(indexPath); + + expect(results).toHaveLength(1); + expect(results[0].trace.messages[0]?.content).toBe('Loaded from pointer'); + expect(results[0].trace.messages[0]?.role).toBe('assistant'); + }); + it('rejects eval-case-only rows with migration guidance', () => { const runDir = path.join(tempDir, '.agentv', 'results', 'runs', '2026-03-25T10-00-00-000Z'); mkdirSync(runDir, { recursive: true }); diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index e3fc7803e..f9676b09d 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -439,14 +439,21 @@ Each result row's `artifact_dir` can include both `outputs/trace.json` and artifact; use each index row's `transcript_path` to find the per-result transcript. +Rows also include `artifact_pointers` for AgentV-owned artifact storage. 
Pointer +entries such as `artifact_pointers.trace` and `artifact_pointers.transcript` +carry the storage `ref`, artifact `key`, canonical run-relative `path`, +`object_version`, `sha256`, `size`, `schema_version`, and `media_type` so +viewers and exports can migrate from git refs to object storage without changing +the run record contract. + `outputs/trace.json` is the full-fidelity `agentv.trace.v1` sidecar. It stores the canonical span graph, source metadata, capture/redaction policy, conversion warnings, score provenance, and opaque evidence references. -`outputs/transcript.jsonl` is a derived compatibility artifact for reading and -replay. It uses provider-neutral `agentv.transcript.v1` rows with stable -top-level fields for message order, role/content, tool calls and paired results, -timing, token usage, cost, source metadata, capture state, and trace pointers. +`outputs/transcript.jsonl` is the canonical AgentV transcript/timeline artifact. +It uses provider-neutral `agentv.transcript.v1` rows with stable top-level fields +for message order, role/content, tool calls and paired results, timing, token +usage, cost, source metadata, capture state, and trace pointers. Provider-native payloads can appear only inside opaque nested fields such as `metadata`, `source.metadata`, tool `input`, or tool `output`. diff --git a/apps/web/src/content/docs/docs/tools/results.mdx b/apps/web/src/content/docs/docs/tools/results.mdx index 4837595aa..50254dfc8 100644 --- a/apps/web/src/content/docs/docs/tools/results.mdx +++ b/apps/web/src/content/docs/docs/tools/results.mdx @@ -127,7 +127,7 @@ The CLI contract is deliberately narrow: `agentv results` manages local result a Use these supported remote workflows instead: -- **Automatic publishing:** configure `projects[].results` or top-level `results`; new `agentv eval` and `agentv pipeline bench` runs publish completed artifacts after the run completes. 
Use `repo_path: .` with `branch: agentv/results/v1` to store results on a dedicated branch of the source repo. `repo_path` means an existing local Git checkout, distinct from `workspace.repos[].repo`, which is a portable repository identity. Set `sync.auto_push: true` to push after publish, or `sync.require_push: true` in CI to fail when that push fails. While an eval is still running, [WIP checkpoints](/docs/tools/wip-checkpoints/) can keep partial run output durable on `agentv/wip/...` branches when auto-push is enabled. +- **Automatic publishing:** configure `projects[].results` or top-level `results`; new `agentv eval` and `agentv pipeline bench` runs publish completed artifacts after the run completes. Use `repo_path: .` with `branch: agentv/results/v1` to store primary result records on a dedicated branch of the source repo. AgentV reserves `agentv/results/v1` for primary results, `agentv/results/v1/artifacts` for heavy artifact payloads, and `agentv/results/v1/oplog` for mutable run/result operations. `repo_path` means an existing local Git checkout, distinct from `workspace.repos[].repo`, which is a portable repository identity. Set `sync.auto_push: true` to push after publish, or `sync.require_push: true` in CI to fail when that push fails. While an eval is still running, [WIP checkpoints](/docs/tools/wip-checkpoints/) can keep partial run output durable on `agentv/wip/...` branches when auto-push is enabled. - **Manual Dashboard sync:** run `agentv dashboard`, open the project, and use **Sync Project**. - **Manual API sync:** while Dashboard is running, call `GET /api/projects/:projectId/remote/status` or `POST /api/projects/:projectId/remote/sync` for project-scoped automation. Single-project sessions also expose `GET /api/remote/status` and `POST /api/remote/sync`. - **Git escape hatch:** for advanced recovery, inspect or repair the configured `projects[].results.path` clone with `git` directly, then sync again. 
diff --git a/packages/core/src/evaluation/result-artifact-contract.ts b/packages/core/src/evaluation/result-artifact-contract.ts new file mode 100644 index 000000000..f91c1f8bb --- /dev/null +++ b/packages/core/src/evaluation/result-artifact-contract.ts @@ -0,0 +1,103 @@ +/** + * AgentV-owned result artifact contract. + * + * This module centralizes the git refs and portable pointer shapes used by run + * records. Local run workspaces still write their files under the existing + * per-result artifact directories; these pointers describe where those same + * AgentV-owned artifacts belong when projected to a results ref, sidecar ref, + * or object store. + */ + +export const AGENTV_RESULTS_PRIMARY_REF = 'agentv/results/v1' as const; +export const AGENTV_RESULTS_ARTIFACTS_REF = 'agentv/results/v1/artifacts' as const; +export const AGENTV_RESULTS_OPLOG_REF = 'agentv/results/v1/oplog' as const; + +export const AGENTV_RESULTS_REFS = { + primary: AGENTV_RESULTS_PRIMARY_REF, + artifacts: AGENTV_RESULTS_ARTIFACTS_REF, + oplog: AGENTV_RESULTS_OPLOG_REF, +} as const; + +export const CANONICAL_TRACE_ARTIFACT_PATH = 'outputs/trace.json' as const; +export const CANONICAL_TRANSCRIPT_ARTIFACT_PATH = 'outputs/transcript.jsonl' as const; + +export const TRANSCRIPT_SCHEMA_VERSION = 'agentv.transcript.v1' as const; +export const TRANSCRIPT_JSONL_MEDIA_TYPE = 'application/x-ndjson' as const; +export const TRACE_JSON_MEDIA_TYPE = 'application/vnd.agentv.trace.v1+json' as const; + +export type AgentVResultsRefName = (typeof AGENTV_RESULTS_REFS)[keyof typeof AGENTV_RESULTS_REFS]; + +export type ResultArtifactFamily = 'traces' | 'transcripts' | 'outputs' | 'raw-logs' | 'screenshots'; + +export interface ResultArtifactPointer { + readonly ref: AgentVResultsRefName | string; + readonly key: string; + readonly objectVersion: string; + readonly path: string; + readonly sha256: string; + readonly size: number; + readonly schemaVersion: string; + readonly mediaType: string; + readonly family?: 
ResultArtifactFamily; +} + +export interface ResultArtifactPointerWire { + readonly ref: AgentVResultsRefName | string; + readonly key: string; + readonly object_version: string; + readonly path: string; + readonly sha256: string; + readonly size: number; + readonly schema_version: string; + readonly media_type: string; + readonly family?: ResultArtifactFamily; +} + +export type TranscriptArtifactPointer = ResultArtifactPointer & { + readonly schemaVersion: typeof TRANSCRIPT_SCHEMA_VERSION; + readonly mediaType: typeof TRANSCRIPT_JSONL_MEDIA_TYPE; + readonly family: 'transcripts'; +}; + +export type TranscriptArtifactPointerWire = ResultArtifactPointerWire & { + readonly schema_version: typeof TRANSCRIPT_SCHEMA_VERSION; + readonly media_type: typeof TRANSCRIPT_JSONL_MEDIA_TYPE; + readonly family: 'transcripts'; +}; + +export interface ResultArtifactPointersWire { + readonly trace?: ResultArtifactPointerWire; + readonly transcript?: TranscriptArtifactPointerWire; +} + +export function toResultArtifactPointerWire( + pointer: ResultArtifactPointer, +): ResultArtifactPointerWire { + return { + ref: pointer.ref, + key: pointer.key, + object_version: pointer.objectVersion, + path: pointer.path, + sha256: pointer.sha256, + size: pointer.size, + schema_version: pointer.schemaVersion, + media_type: pointer.mediaType, + family: pointer.family, + }; +} + +export function fromResultArtifactPointerWire( + pointer: ResultArtifactPointerWire, +): ResultArtifactPointer { + return { + ref: pointer.ref, + key: pointer.key, + objectVersion: pointer.object_version, + path: pointer.path, + sha256: pointer.sha256, + size: pointer.size, + schemaVersion: pointer.schema_version, + mediaType: pointer.media_type, + family: pointer.family, + }; +} diff --git a/packages/core/src/evaluation/result-row-schema.ts b/packages/core/src/evaluation/result-row-schema.ts index fff7ef0e0..5ec17dec1 100644 --- a/packages/core/src/evaluation/result-row-schema.ts +++ 
b/packages/core/src/evaluation/result-row-schema.ts @@ -20,6 +20,7 @@ const MIGRATION_GUIDANCE = const RESULT_ROW_ALIASES = { answerPath: 'answer_path', + artifactPointers: 'artifact_pointers', artifactDir: 'artifact_dir', conversationId: 'conversation_id', costUsd: 'cost_usd', diff --git a/packages/core/src/evaluation/results-repo.ts b/packages/core/src/evaluation/results-repo.ts index bbe511487..b08bff3af 100644 --- a/packages/core/src/evaluation/results-repo.ts +++ b/packages/core/src/evaluation/results-repo.ts @@ -14,6 +14,7 @@ import path from 'node:path'; import { promisify } from 'node:util'; import { getAgentvDataDir } from '../paths.js'; +import { AGENTV_RESULTS_PRIMARY_REF } from './result-artifact-contract.js'; import type { ResultsConfig } from './loaders/config-loader.js'; const execFileAsync = promisify(execFile); @@ -32,7 +33,7 @@ const RESULTS_REPO_METADATA_DIR = 'metadata'; const RESULTS_REPO_TRACKED_DIRS = [RESULTS_REPO_RUNS_DIR, RESULTS_REPO_METADATA_DIR] as const; const RESULTS_REPO_COMMIT_EMAIL = 'agentv@results-repo'; const RESULTS_REPO_COMMIT_NAME = 'AgentV Results'; -export const DEFAULT_RESULTS_BRANCH = 'agentv/results/v1'; +export const DEFAULT_RESULTS_BRANCH = AGENTV_RESULTS_PRIMARY_REF; const GIT_EMPTY_TREE = '4b825dc642cb6eb9a060e54bf8d69288fbee4904'; // The results branch is a self-rooted orphan whose first commit is a fixed, // byte-identical empty-tree genesis. Pinning the message, identity (see diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index c15fb205d..d3902716f 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -7,6 +7,7 @@ * snake_case here so every caller produces the same artifacts. 
*/ +import { createHash } from 'node:crypto'; import { mkdir, readFile, writeFile } from 'node:fs/promises'; import path from 'node:path'; @@ -23,8 +24,22 @@ import { import type { Message } from './providers/types.js'; import { extractLastAssistantContent } from './providers/types.js'; import { normalizeResultRow } from './result-row-schema.js'; +import { + AGENTV_RESULTS_ARTIFACTS_REF, + CANONICAL_TRACE_ARTIFACT_PATH, + CANONICAL_TRANSCRIPT_ARTIFACT_PATH, + TRACE_JSON_MEDIA_TYPE, + TRANSCRIPT_JSONL_MEDIA_TYPE, + TRANSCRIPT_SCHEMA_VERSION, + type ResultArtifactFamily, + type ResultArtifactPointerWire, + type ResultArtifactPointersWire, + type TranscriptArtifactPointerWire, + toResultArtifactPointerWire, +} from './result-artifact-contract.js'; import { type TraceEnvelope, + EXECUTION_TRACE_SCHEMA_VERSION, buildTraceEnvelopeFromEvaluationResult, toTraceEnvelopeWire, traceEnvelopeToTranscriptMessages, @@ -205,6 +220,7 @@ export interface IndexArtifactEntry { readonly output_path?: string; readonly answer_path?: string; readonly transcript_path?: string; + readonly artifact_pointers?: ResultArtifactPointersWire; readonly input_path?: string; readonly response_path?: string; readonly task_dir?: string; @@ -751,10 +767,10 @@ function buildTraceEnvelopeSidecar(params: TraceEnvelopeSidecarParams): TraceEnv source: { path: RESULT_INDEX_FILENAME }, capture: { content: 'full', redactionLevel: 'none', redactedFields: [] }, artifacts: { - trace_path: 'outputs/trace.json', + trace_path: CANONICAL_TRACE_ARTIFACT_PATH, answer_path: params.result.output.length > 0 ? 'outputs/answer.md' : undefined, response_path: params.result.output.length > 0 ? 'outputs/response.md' : undefined, - transcript_path: hasTranscript ? 'outputs/transcript.jsonl' : undefined, + transcript_path: hasTranscript ? 
CANONICAL_TRANSCRIPT_ARTIFACT_PATH : undefined, }, duplicatePolicy: params.duplicatePolicy, }); @@ -772,6 +788,75 @@ async function writeTraceEnvelopeSidecar( return envelope; } +function buildSidecarArtifactKey( + family: ResultArtifactFamily, + runRelativePath: string, +): string { + return path.posix.join(family, runRelativePath); +} + +async function buildArtifactPointer(params: { + readonly filePath: string; + readonly runRelativePath: string; + readonly family: ResultArtifactFamily; + readonly schemaVersion: string; + readonly mediaType: string; +}): Promise<ResultArtifactPointerWire> { + const content = await readFile(params.filePath); + const sha256 = createHash('sha256').update(content).digest('hex'); + return toResultArtifactPointerWire({ + ref: AGENTV_RESULTS_ARTIFACTS_REF, + key: buildSidecarArtifactKey(params.family, params.runRelativePath), + objectVersion: `sha256:${sha256}`, + path: params.runRelativePath, + sha256, + size: content.byteLength, + schemaVersion: params.schemaVersion, + mediaType: params.mediaType, + family: params.family, + }); +} + +async function buildTracePointer( + outputDir: string, + tracePath: string, +): Promise<ResultArtifactPointerWire> { + return buildArtifactPointer({ + filePath: tracePath, + runRelativePath: toRelativeArtifactPath(outputDir, tracePath), + family: 'traces', + schemaVersion: EXECUTION_TRACE_SCHEMA_VERSION, + mediaType: TRACE_JSON_MEDIA_TYPE, + }); +} + +async function buildTranscriptPointer( + outputDir: string, + transcriptPath: string, +): Promise<TranscriptArtifactPointerWire> { + const pointer = await buildArtifactPointer({ + filePath: transcriptPath, + runRelativePath: toRelativeArtifactPath(outputDir, transcriptPath), + family: 'transcripts', + schemaVersion: TRANSCRIPT_SCHEMA_VERSION, + mediaType: TRANSCRIPT_JSONL_MEDIA_TYPE, + }); + return pointer as TranscriptArtifactPointerWire; +} + +async function buildArtifactPointers(params: { + readonly outputDir: string; + readonly tracePath: string; + readonly transcriptPath?: string; +}): Promise<ResultArtifactPointersWire> { + return { + trace: await
buildTracePointer(params.outputDir, params.tracePath), + ...(params.transcriptPath + ? { transcript: await buildTranscriptPointer(params.outputDir, params.transcriptPath) } + : {}), + }; +} + export function buildIndexArtifactEntry( result: EvaluationResult, options: { @@ -782,6 +867,7 @@ export function buildIndexArtifactEntry( outputPath?: string; answerPath?: string; transcriptPath?: string; + artifactPointers?: ResultArtifactPointersWire; inputPath?: string; responsePath?: string; extraIndexFields?: AdditionalResultIndexFields; @@ -822,6 +908,7 @@ export function buildIndexArtifactEntry( transcript_path: options.transcriptPath ? toRelativeArtifactPath(options.outputDir, options.transcriptPath) : undefined, + artifact_pointers: options.artifactPointers, input_path: options.inputPath ? toRelativeArtifactPath(options.outputDir, options.inputPath) : undefined, @@ -843,6 +930,7 @@ export function buildResultIndexArtifact( options?: { projectionIdentity?: ProjectionIdentity; duplicatePolicy?: ExportDuplicatePolicy; + artifactPointers?: ResultArtifactPointersWire; }, ): ResultIndexArtifact { const artifactSubdir = buildArtifactSubdir(result); @@ -878,6 +966,7 @@ export function buildResultIndexArtifact( transcript_path: hasTranscript ? path.posix.join(artifactSubdir, 'outputs', 'transcript.jsonl') : undefined, + artifact_pointers: options?.artifactPointers, response_path: hasAnswer ? 
path.posix.join(artifactSubdir, 'outputs', 'response.md') : undefined, @@ -1263,6 +1352,7 @@ export async function writePerTestArtifacts( await writeFile(path.join(outputsDir, 'answer.md'), result.output, 'utf8'); await writeFile(path.join(outputsDir, 'response.md'), result.output, 'utf8'); } + const tracePath = path.join(outputsDir, 'trace.json'); const envelope = await writeTraceEnvelopeSidecar({ result, outputDir, @@ -1272,9 +1362,17 @@ export async function writePerTestArtifacts( runId: options?.runId, duplicatePolicy, }); - if (hasTranscriptProjection(result, envelope)) { - await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result, envelope); + const transcriptPath = hasTranscriptProjection(result, envelope) + ? path.join(outputsDir, 'transcript.jsonl') + : undefined; + if (transcriptPath) { + await writeTranscriptJsonl(transcriptPath, result, envelope); } + const artifactPointers = await buildArtifactPointers({ + outputDir, + tracePath, + transcriptPath, + }); const extraIndexFields = await collectAdditionalIndexFields( result, @@ -1288,6 +1386,7 @@ export async function writePerTestArtifacts( ...buildResultIndexArtifact(result, extraIndexFields, { projectionIdentity: envelope.projectionIdentity, duplicatePolicy, + artifactPointers, }), experiment: options?.experiment, }); @@ -1351,6 +1450,7 @@ export async function writeArtifactsFromResults( const transcriptPath = hasTranscriptProjection(result, envelope) ? path.join(outputsDir, 'transcript.jsonl') : undefined; + const tracePath = path.join(outputsDir, 'trace.json'); const projectionIdentity = envelope.projectionIdentity; if (!projectionIdentity) { throw new Error(`Result ${result.testId ?? 
'unknown'} is missing projection identity`); @@ -1368,6 +1468,7 @@ export async function writeArtifactsFromResults( outputsDir, answerPath, responsePath, + tracePath, envelope, projectionIdentity, transcriptPath, @@ -1417,13 +1518,18 @@ export async function writeArtifactsFromResults( await writeFile(plan.responsePath, result.output, 'utf8'); } await writeFile( - path.join(plan.outputsDir, 'trace.json'), + plan.tracePath, `${JSON.stringify(toTraceEnvelopeWire(envelope), null, 2)}\n`, 'utf8', ); if (plan.transcriptPath) { await writeTranscriptJsonl(plan.transcriptPath, result, envelope); } + const artifactPointers = await buildArtifactPointers({ + outputDir, + tracePath: plan.tracePath, + transcriptPath: plan.transcriptPath, + }); const extraIndexFields = await collectAdditionalIndexFields( result, @@ -1442,6 +1548,7 @@ export async function writeArtifactsFromResults( outputPath: plan.answerPath, answerPath: plan.answerPath, transcriptPath: plan.transcriptPath, + artifactPointers, inputPath: plan.inputPath, responsePath: plan.responsePath, extraIndexFields, diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 0b351ca58..c0e10451d 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -9,6 +9,7 @@ export { ResultRowSchemaError, normalizeResultRow, } from './evaluation/result-row-schema.js'; +export * from './evaluation/result-artifact-contract.js'; export { parseYamlValue } from './evaluation/yaml-loader.js'; export * from './evaluation/yaml-parser.js'; export { diff --git a/packages/core/test/evaluation/results-repo.test.ts b/packages/core/test/evaluation/results-repo.test.ts index c006c4e4c..3d956420d 100644 --- a/packages/core/test/evaluation/results-repo.test.ts +++ b/packages/core/test/evaluation/results-repo.test.ts @@ -5,6 +5,7 @@ import path from 'node:path'; import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { AGENTV_RESULTS_REFS } from '../../src/evaluation/result-artifact-contract.js'; 
import type { ResultsConfig } from '../../src/evaluation/loaders/config-loader.js'; import { DEFAULT_RESULTS_BRANCH, @@ -332,6 +333,12 @@ describe('results repo write path', () => { ); expect(DEFAULT_RESULTS_BRANCH).toBe('agentv/results/v1'); + expect(DEFAULT_RESULTS_BRANCH).toBe(AGENTV_RESULTS_REFS.primary); + expect(AGENTV_RESULTS_REFS).toEqual({ + primary: 'agentv/results/v1', + artifacts: 'agentv/results/v1/artifacts', + oplog: 'agentv/results/v1/oplog', + }); expect(normalized.branch).toBe('agentv/results/v1'); expect(normalized.repo_path).toBe('/tmp/source-project'); expect(normalized.auto_push).toBe(false); From 0aafea0e05aedfa570e02c28725397f8ca54d708 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 05:16:14 +0200 Subject: [PATCH 02/21] feat(results): expose run oplog state watermark --- .../src/commands/results/remote-metadata.ts | 28 +++- apps/cli/src/commands/results/run-oplog.ts | 141 ++++++++++++++++++ apps/cli/src/commands/results/run-tags.ts | 46 +++++- apps/cli/src/commands/results/serve.ts | 57 ++++++- .../commands/results/remote-metadata.test.ts | 5 + .../test/commands/results/run-oplog.test.ts | 77 ++++++++++ apps/cli/test/commands/results/serve.test.ts | 85 ++++++++++- apps/dashboard/src/lib/types.ts | 21 +++ 8 files changed, 449 insertions(+), 11 deletions(-) create mode 100644 apps/cli/src/commands/results/run-oplog.ts create mode 100644 apps/cli/test/commands/results/run-oplog.test.ts diff --git a/apps/cli/src/commands/results/remote-metadata.ts b/apps/cli/src/commands/results/remote-metadata.ts index 00f57de7b..d9c00a1c3 100644 --- a/apps/cli/src/commands/results/remote-metadata.ts +++ b/apps/cli/src/commands/results/remote-metadata.ts @@ -17,6 +17,13 @@ import { execFileSync } from 'node:child_process'; import { existsSync, mkdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; import path from 'node:path'; +import { + type RunOplogWatermark, + buildRunIdFromRelativePath, + createRunTagsSetOperation, + 
normalizeRunOplogWatermark, + watermarkFromRunOperation, +} from './run-oplog.js'; import { RUN_TAGS_FILENAME, normalizeTags } from './run-tags.js'; const RESULTS_RUNS_DIR = 'runs'; @@ -25,6 +32,7 @@ const REMOTE_METADATA_RUNS_DIR = path.join('metadata', 'runs'); interface TagsFile { readonly tags: string[]; readonly updatedAt?: string; + readonly oplogWatermark?: RunOplogWatermark; } interface RemoteRunMetadataPaths { @@ -48,6 +56,7 @@ export interface RemoteRunTagState { readonly pendingTags?: string[]; readonly dirty: boolean; readonly updatedAt?: string; + readonly oplogWatermark: RunOplogWatermark; readonly metadataPath: string; } @@ -112,9 +121,11 @@ function parseTagsFile(content: string): TagsFile | undefined { const record = parsed as Record; if (!Array.isArray(record.tags)) return undefined; const tags = record.tags.filter((tag): tag is string => typeof tag === 'string'); + const updatedAt = typeof record.updated_at === 'string' ? record.updated_at : undefined; return { tags, - updatedAt: typeof record.updated_at === 'string' ? record.updated_at : undefined, + updatedAt, + oplogWatermark: normalizeRunOplogWatermark(record.oplog_watermark, updatedAt), }; } @@ -189,6 +200,11 @@ function toRemoteRunTagState(context: RemoteRunTagsContext): RemoteRunTagState { const remoteTags = context.baseOverlayTags?.tags ?? context.artifactTags?.tags ?? []; const effectiveTags = context.localOverlayTags?.tags ?? remoteTags; const dirty = !equalTags(effectiveTags, remoteTags); + const watermark = + context.localOverlayTags?.oplogWatermark ?? + context.baseOverlayTags?.oplogWatermark ?? + context.artifactTags?.oplogWatermark ?? + normalizeRunOplogWatermark(undefined); return { tags: effectiveTags, @@ -199,6 +215,7 @@ function toRemoteRunTagState(context: RemoteRunTagsContext): RemoteRunTagState { context.localOverlayTags?.updatedAt ?? context.baseOverlayTags?.updatedAt ?? 
context.artifactTags?.updatedAt, + oplogWatermark: watermark, metadataPath: context.paths.overlayTagsPath, }; } @@ -240,9 +257,16 @@ export function writeRemoteRunTags( return readRemoteRunTags(repoDir, manifestPath); } + const operation = createRunTagsSetOperation({ + runId: buildRunIdFromRelativePath(context.paths.runRelativePath), + runPath: context.paths.runRelativePath, + tags: cleaned, + actor: { kind: 'dashboard' }, + }); const entry = { tags: cleaned, - updated_at: new Date().toISOString(), + updated_at: operation.authored_at, + oplog_watermark: watermarkFromRunOperation(operation), }; mkdirSync(path.dirname(context.paths.overlayTagsPath), { recursive: true }); writeFileSync(context.paths.overlayTagsPath, `${JSON.stringify(entry, null, 2)}\n`, 'utf8'); diff --git a/apps/cli/src/commands/results/run-oplog.ts b/apps/cli/src/commands/results/run-oplog.ts new file mode 100644 index 000000000..afd295cba --- /dev/null +++ b/apps/cli/src/commands/results/run-oplog.ts @@ -0,0 +1,141 @@ +import { randomUUID } from 'node:crypto'; + +/** + * Minimal run operation-log contract used by Dashboard read models. + * + * The raw oplog storage branch is intentionally not implemented here. This + * module only centralizes the ref name, a small typed operation envelope for + * tag replacement, and the materialized final-state shape that readers consume. 
+ */ + +export const RUN_OPLOG_REF = 'agentv/results/v1/oplog'; +export const RUN_OPERATION_SCHEMA_VERSION = 'agentv.run_operation.v1'; + +export type RunFinalStateLifecycle = 'active' | 'hidden' | 'deleted'; + +export interface RunOplogWatermark { + readonly ref: typeof RUN_OPLOG_REF; + readonly operation_id?: string; + readonly updated_at?: string; +} + +export interface RunFinalState { + readonly lifecycle: RunFinalStateLifecycle; + readonly tags: string[]; +} + +export interface RunReadStateFields { + readonly final_state: RunFinalState; + readonly oplog_watermark: RunOplogWatermark; +} + +export type RunOperationActorKind = 'dashboard' | 'cli' | 'ci' | 'agent' | 'unknown'; + +export interface RunOperationActor { + readonly kind: RunOperationActorKind; + readonly id?: string; +} + +export interface RunOperationSubject { + readonly run_id: string; + readonly run_path?: string; +} + +export interface RunTagsSetOperation { + readonly schema_version: typeof RUN_OPERATION_SCHEMA_VERSION; + readonly operation_id: string; + readonly operation_type: 'run.tags.set'; + readonly authored_at: string; + readonly actor: RunOperationActor; + readonly subject: RunOperationSubject; + readonly payload: { + readonly tags: string[]; + }; +} + +export type RunOperationEnvelope = RunTagsSetOperation; + +export function buildRunIdFromRelativePath(relativeRunPath: string): string { + const segments = relativeRunPath.split(/[\\/]+/).filter(Boolean); + if (segments.length >= 2) { + const experiment = segments.slice(0, -1).join('/'); + const runName = segments.at(-1) ?? relativeRunPath; + return experiment === 'default' ? runName : `${experiment}::${runName}`; + } + return segments[0] ?? 
relativeRunPath; +} + +export function createRunTagsSetOperation(input: { + readonly runId: string; + readonly runPath?: string; + readonly tags: readonly string[]; + readonly actor?: RunOperationActor; + readonly authoredAt?: string; + readonly operationId?: string; +}): RunTagsSetOperation { + return { + schema_version: RUN_OPERATION_SCHEMA_VERSION, + operation_id: input.operationId ?? randomUUID(), + operation_type: 'run.tags.set', + authored_at: input.authoredAt ?? new Date().toISOString(), + actor: input.actor ?? { kind: 'unknown' }, + subject: { + run_id: input.runId, + ...(input.runPath ? { run_path: input.runPath } : {}), + }, + payload: { + tags: [...input.tags], + }, + }; +} + +export function watermarkFromRunOperation(operation: RunOperationEnvelope): RunOplogWatermark { + return { + ref: RUN_OPLOG_REF, + operation_id: operation.operation_id, + updated_at: operation.authored_at, + }; +} + +export function normalizeRunOplogWatermark( + input: unknown, + fallbackUpdatedAt?: string, +): RunOplogWatermark { + if (input && typeof input === 'object') { + const record = input as Record; + const operationId = record.operation_id; + const updatedAt = record.updated_at; + return { + ref: RUN_OPLOG_REF, + ...(typeof operationId === 'string' && operationId ? { operation_id: operationId } : {}), + ...(typeof updatedAt === 'string' && updatedAt + ? { updated_at: updatedAt } + : fallbackUpdatedAt + ? { updated_at: fallbackUpdatedAt } + : {}), + }; + } + + return { + ref: RUN_OPLOG_REF, + ...(fallbackUpdatedAt ? { updated_at: fallbackUpdatedAt } : {}), + }; +} + +export function materializeRunState(input?: { + readonly lifecycle?: RunFinalStateLifecycle; + readonly tags?: readonly string[]; + readonly watermark?: RunOplogWatermark; + readonly updatedAt?: string; +}): RunReadStateFields { + const tags = [...(input?.tags ?? [])]; + const watermark = input?.watermark ?? 
normalizeRunOplogWatermark(undefined, input?.updatedAt); + + return { + final_state: { + lifecycle: input?.lifecycle ?? 'active', + tags, + }, + oplog_watermark: watermark, + }; +} diff --git a/apps/cli/src/commands/results/run-tags.ts b/apps/cli/src/commands/results/run-tags.ts index 3714f73d7..4cf18e83d 100644 --- a/apps/cli/src/commands/results/run-tags.ts +++ b/apps/cli/src/commands/results/run-tags.ts @@ -7,13 +7,17 @@ * * Wire format (stored on disk): * ```json - * { "tags": ["baseline", "v2-prompt"], "updated_at": "2026-04-10T00:00:00.000Z" } + * { + * "tags": ["baseline", "v2-prompt"], + * "updated_at": "2026-04-10T00:00:00.000Z", + * "oplog_watermark": { "ref": "agentv/results/v1/oplog" } + * } * ``` * * Used by the Dashboard compare API so users can retroactively tag runs - * without changing the eval YAML or the run manifest itself. This mirrors - * the Langfuse / W&B / GitHub `tags` pattern — a mutable multi-valued - * list of free-form labels that lives alongside the immutable run_id. + * without changing the eval YAML or the run manifest itself. Tags are a + * mutable multi-valued list of free-form labels that lives alongside the + * immutable run_id. * * Validation rules: * - Each tag is 1–60 characters after trimming @@ -29,6 +33,14 @@ import { existsSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs'; import path from 'node:path'; +import { + type RunOplogWatermark, + buildRunIdFromRelativePath, + createRunTagsSetOperation, + normalizeRunOplogWatermark, + watermarkFromRunOperation, +} from './run-oplog.js'; + export const RUN_TAGS_FILENAME = 'tags.json'; /** Maximum number of tags per run. */ @@ -42,6 +54,8 @@ export interface RunTagsFile { tags: string[]; /** ISO-8601 timestamp of last update. */ updated_at: string; + /** Watermark for the operation-log state this materialized tag list reflects. */ + oplog_watermark?: RunOplogWatermark; } /** Resolve the tags sidecar path given a run manifest (index.jsonl) path. 
*/ @@ -49,6 +63,16 @@ export function runTagsPath(manifestPath: string): string { return path.join(path.dirname(manifestPath), RUN_TAGS_FILENAME); } +function inferRunRelativePath(manifestPath: string): string { + const runDir = path.dirname(manifestPath); + const segments = runDir.split(path.sep); + const runsIndex = segments.lastIndexOf('runs'); + if (runsIndex >= 0 && runsIndex < segments.length - 1) { + return segments.slice(runsIndex + 1).join('/'); + } + return path.basename(runDir); +} + /** Read the tags for a run. Returns `undefined` if missing or unreadable. */ export function readRunTags(manifestPath: string): RunTagsFile | undefined { const fp = runTagsPath(manifestPath); @@ -62,9 +86,11 @@ export function readRunTags(manifestPath: string): RunTagsFile | undefined { (t): t is string => typeof t === 'string' && t.trim().length > 0, ); if (tags.length === 0) return undefined; + const updatedAt = typeof record.updated_at === 'string' ? record.updated_at : ''; return { tags, - updated_at: typeof record.updated_at === 'string' ? 
record.updated_at : '', + updated_at: updatedAt, + oplog_watermark: normalizeRunOplogWatermark(record.oplog_watermark, updatedAt || undefined), }; } catch { return undefined; @@ -81,9 +107,17 @@ export function writeRunTags(manifestPath: string, tags: readonly string[]): Run deleteRunTags(manifestPath); return null; } + const runPath = inferRunRelativePath(manifestPath); + const operation = createRunTagsSetOperation({ + runId: buildRunIdFromRelativePath(runPath), + runPath, + tags: cleaned, + actor: { kind: 'dashboard' }, + }); const entry: RunTagsFile = { tags: cleaned, - updated_at: new Date().toISOString(), + updated_at: operation.authored_at, + oplog_watermark: watermarkFromRunOperation(operation), }; writeFileSync(runTagsPath(manifestPath), `${JSON.stringify(entry, null, 2)}\n`, 'utf8'); return entry; diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 839fc68a5..8350567cc 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -79,6 +79,12 @@ import { setRemoteRunTags, syncRemoteResults, } from './remote.js'; +import { + type RunFinalState, + type RunOplogWatermark, + type RunReadStateFields, + materializeRunState, +} from './run-oplog.js'; import { deleteRunTags, readRunTags, writeRunTags } from './run-tags.js'; import { type StudioConfig, loadStudioConfig, saveStudioConfig } from './studio-config.js'; @@ -316,6 +322,8 @@ interface RunTagFields { readonly remote_tags?: string[]; readonly pending_tags?: string[]; readonly metadata_dirty?: boolean; + readonly final_state: RunFinalState; + readonly oplog_watermark: RunOplogWatermark; } // biome-ignore lint/suspicious/noExplicitAny: Hono Context generic varies by route @@ -340,7 +348,15 @@ async function readRunTagFields( ): Promise { if (meta.source === 'local') { const tagsEntry = readRunTags(meta.path); - return tagsEntry ? 
{ tags: tagsEntry.tags } : {}; + const runState = materializeRunState({ + tags: tagsEntry?.tags ?? [], + watermark: tagsEntry?.oplog_watermark, + updatedAt: tagsEntry?.updated_at || undefined, + }); + return { + ...(tagsEntry ? { tags: tagsEntry.tags } : {}), + ...runState, + }; } const state = await readRemoteRunTagState(searchDir, meta, projectId); @@ -349,6 +365,7 @@ async function readRunTagFields( tags: [], remote_tags: [], metadata_dirty: false, + ...materializeRunState({ tags: [] }), }; } @@ -357,6 +374,11 @@ async function readRunTagFields( remote_tags: state.remoteTags, metadata_dirty: state.dirty, ...(state.dirty && { pending_tags: state.pendingTags ?? state.tags }), + ...materializeRunState({ + tags: state.tags, + watermark: state.oplogWatermark, + updatedAt: state.updatedAt, + }), }; } @@ -366,16 +388,34 @@ function remoteTagMutationResponse(state: { readonly pendingTags?: string[]; readonly dirty: boolean; readonly updatedAt?: string; + readonly oplogWatermark: RunOplogWatermark; }) { return { tags: state.tags, remote_tags: state.remoteTags, metadata_dirty: state.dirty, ...(state.dirty && { pending_tags: state.pendingTags ?? state.tags }), + ...materializeRunState({ + tags: state.tags, + watermark: state.oplogWatermark, + updatedAt: state.updatedAt, + }), updated_at: state.updatedAt ?? new Date().toISOString(), }; } +function localTagMutationResponse(input: { + readonly tags: readonly string[]; + readonly updatedAt?: string; + readonly watermark?: RunOplogWatermark; +}): RunReadStateFields { + return materializeRunState({ + tags: input.tags, + watermark: input.watermark, + updatedAt: input.updatedAt, + }); +} + function remoteMetadataErrorStatus(error: unknown): 400 | 409 { const message = error instanceof Error ? 
error.message : String(error); if ( @@ -1025,6 +1065,8 @@ async function handleCompare(c: C, { searchDir, agentvDir, projectId }: DataCont remote_tags?: string[]; pending_tags?: string[]; metadata_dirty?: boolean; + final_state: RunFinalState; + oplog_watermark: RunOplogWatermark; source: 'local' | 'remote'; eval_count: number; quality_count: number; @@ -1472,8 +1514,14 @@ async function handleRunTagsPut(c: C, { searchDir, projectId }: DataContext) { } const entry = writeRunTags(meta.path, tags as string[]); + const responseState = localTagMutationResponse({ + tags: entry?.tags ?? [], + updatedAt: entry?.updated_at, + watermark: entry?.oplog_watermark, + }); return c.json({ tags: entry?.tags ?? [], + ...responseState, updated_at: entry?.updated_at ?? new Date().toISOString(), }); } catch (err) { @@ -1495,7 +1543,10 @@ async function handleRunTagsDelete(c: C, { searchDir, projectId }: DataContext) } deleteRunTags(meta.path); - return c.json({ ok: true }); + return c.json({ + ok: true, + ...localTagMutationResponse({ tags: [] }), + }); } catch (err) { return c.json({ error: (err as Error).message }, remoteMetadataErrorStatus(err)); } @@ -1831,6 +1882,8 @@ export function createApp( remote_tags?: string[]; pending_tags?: string[]; metadata_dirty?: boolean; + final_state: RunFinalState; + oplog_watermark: RunOplogWatermark; source: 'local' | 'remote'; project_id: string; project_name: string; diff --git a/apps/cli/test/commands/results/remote-metadata.test.ts b/apps/cli/test/commands/results/remote-metadata.test.ts index fb66e430b..8b47b6f86 100644 --- a/apps/cli/test/commands/results/remote-metadata.test.ts +++ b/apps/cli/test/commands/results/remote-metadata.test.ts @@ -11,6 +11,7 @@ import { readRemoteRunTags, writeRemoteRunTags, } from '../../../src/commands/results/remote-metadata.js'; +import { RUN_OPLOG_REF } from '../../../src/commands/results/run-oplog.js'; const RUN_TIMESTAMP = '2026-06-06T10-00-00-000Z'; @@ -72,6 +73,8 @@ describe('remote metadata tags', () 
=> { expect(state.remoteTags).toEqual(['remote-baseline']); expect(state.pendingTags).toEqual(['pending', 'remote-baseline']); expect(state.dirty).toBe(true); + expect(state.oplogWatermark.ref).toBe(RUN_OPLOG_REF); + expect(state.oplogWatermark.operation_id).toBeString(); expect(state.metadataPath).toContain( path.join('metadata', 'runs', 'default', RUN_TIMESTAMP, 'tags.json'), ); @@ -83,6 +86,7 @@ describe('remote metadata tags', () => { expect(reloaded.tags).toEqual(['pending', 'remote-baseline']); expect(reloaded.pendingTags).toEqual(['pending', 'remote-baseline']); expect(reloaded.dirty).toBe(true); + expect(reloaded.oplogWatermark.operation_id).toBe(state.oplogWatermark.operation_id); }); it('uses committed metadata overlays as the clean remote baseline', () => { @@ -98,6 +102,7 @@ describe('remote metadata tags', () => { expect(reloaded.remoteTags).toEqual(['accepted']); expect(reloaded.pendingTags).toBeUndefined(); expect(reloaded.dirty).toBe(false); + expect(reloaded.oplogWatermark.ref).toBe(RUN_OPLOG_REF); }); it('persists clearing remote tags as an empty pending overlay', () => { diff --git a/apps/cli/test/commands/results/run-oplog.test.ts b/apps/cli/test/commands/results/run-oplog.test.ts new file mode 100644 index 000000000..1ab0ce226 --- /dev/null +++ b/apps/cli/test/commands/results/run-oplog.test.ts @@ -0,0 +1,77 @@ +import { describe, expect, it } from 'bun:test'; + +import { + RUN_OPERATION_SCHEMA_VERSION, + RUN_OPLOG_REF, + buildRunIdFromRelativePath, + createRunTagsSetOperation, + materializeRunState, + watermarkFromRunOperation, +} from '../../../src/commands/results/run-oplog.js'; + +describe('run operation log contract', () => { + it('defines the stable oplog ref', () => { + expect(RUN_OPLOG_REF).toBe('agentv/results/v1/oplog'); + }); + + it('builds a typed tag replacement operation envelope', () => { + const operation = createRunTagsSetOperation({ + runId: 'smoke::2026-06-21T10-00-00-000Z', + runPath: 'smoke/2026-06-21T10-00-00-000Z', + 
tags: ['baseline', 'reviewed'], + actor: { kind: 'dashboard', id: 'local' }, + authoredAt: '2026-06-21T10:15:00.000Z', + operationId: 'op-123', + }); + + expect(operation).toEqual({ + schema_version: RUN_OPERATION_SCHEMA_VERSION, + operation_id: 'op-123', + operation_type: 'run.tags.set', + authored_at: '2026-06-21T10:15:00.000Z', + actor: { kind: 'dashboard', id: 'local' }, + subject: { + run_id: 'smoke::2026-06-21T10-00-00-000Z', + run_path: 'smoke/2026-06-21T10-00-00-000Z', + }, + payload: { + tags: ['baseline', 'reviewed'], + }, + }); + }); + + it('materializes final run state from tags and an operation watermark', () => { + const operation = createRunTagsSetOperation({ + runId: '2026-06-21T10-00-00-000Z', + tags: ['accepted'], + authoredAt: '2026-06-21T10:15:00.000Z', + operationId: 'op-456', + }); + + expect( + materializeRunState({ + tags: operation.payload.tags, + watermark: watermarkFromRunOperation(operation), + }), + ).toEqual({ + final_state: { + lifecycle: 'active', + tags: ['accepted'], + }, + oplog_watermark: { + ref: RUN_OPLOG_REF, + operation_id: 'op-456', + updated_at: '2026-06-21T10:15:00.000Z', + }, + }); + }); + + it('derives run IDs from results branch paths', () => { + expect(buildRunIdFromRelativePath('default/2026-06-21T10-00-00-000Z')).toBe( + '2026-06-21T10-00-00-000Z', + ); + expect(buildRunIdFromRelativePath('smoke/2026-06-21T10-00-00-000Z')).toBe( + 'smoke::2026-06-21T10-00-00-000Z', + ); + }); +}); diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 834d041ca..4e7756542 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -8,6 +8,7 @@ import { fileURLToPath } from 'node:url'; import { addProject, saveProjectRegistry } from '@agentv/core'; +import { RUN_OPLOG_REF } from '../../../src/commands/results/run-oplog.js'; import { createApp, loadResults, @@ -934,7 +935,13 @@ describe('serve app', () => { 
expect(res.status).toBe(200); const data = (await res.json()) as { - runs: Array<{ filename: string; source: string; on_remote: boolean }>; + runs: Array<{ + filename: string; + source: string; + on_remote: boolean; + final_state: { lifecycle: string; tags: string[] }; + oplog_watermark: { ref: string }; + }>; }; expect(data.runs).toHaveLength(1); // A local-only run (no remote configured) is not on the remote branch. @@ -942,6 +949,82 @@ describe('serve app', () => { filename, source: 'local', on_remote: false, + final_state: { + lifecycle: 'active', + tags: [], + }, + oplog_watermark: { + ref: RUN_OPLOG_REF, + }, + }); + }); + + it('exposes materialized final state and oplog watermark for local run tags', async () => { + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); + mkdirSync(runsDir, { recursive: true }); + const filename = '2026-03-25T10-00-00-000Z'; + const runDir = path.join(runsDir, filename); + mkdirSync(runDir, { recursive: true }); + writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(RESULT_A)); + writeFileSync( + path.join(runDir, 'tags.json'), + `${JSON.stringify( + { + tags: ['accepted'], + updated_at: '2026-06-21T10:15:00.000Z', + oplog_watermark: { + ref: RUN_OPLOG_REF, + operation_id: 'op-local-tags', + updated_at: '2026-06-21T10:15:00.000Z', + }, + }, + null, + 2, + )}\n`, + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + + const listRes = await app.request('/api/runs'); + expect(listRes.status).toBe(200); + const listData = (await listRes.json()) as { + runs: Array<{ + tags: string[]; + final_state: { lifecycle: string; tags: string[] }; + oplog_watermark: { ref: string; operation_id?: string; updated_at?: string }; + }>; + }; + expect(listData.runs[0]).toMatchObject({ + tags: ['accepted'], + final_state: { + lifecycle: 'active', + tags: ['accepted'], + }, + oplog_watermark: { + ref: RUN_OPLOG_REF, + operation_id: 'op-local-tags', + updated_at: '2026-06-21T10:15:00.000Z', + }, + }); + + 
const detailRes = await app.request(`/api/runs/${encodeURIComponent(filename)}`); + expect(detailRes.status).toBe(200); + const detailData = (await detailRes.json()) as { + tags: string[]; + final_state: { lifecycle: string; tags: string[] }; + oplog_watermark: { ref: string; operation_id?: string; updated_at?: string }; + }; + expect(detailData).toMatchObject({ + tags: ['accepted'], + final_state: { + lifecycle: 'active', + tags: ['accepted'], + }, + oplog_watermark: { + ref: RUN_OPLOG_REF, + operation_id: 'op-local-tags', + updated_at: '2026-06-21T10:15:00.000Z', + }, }); }); diff --git a/apps/dashboard/src/lib/types.ts b/apps/dashboard/src/lib/types.ts index 087836a37..4a8e5065a 100644 --- a/apps/dashboard/src/lib/types.ts +++ b/apps/dashboard/src/lib/types.ts @@ -35,6 +35,10 @@ export interface RunMeta { pending_tags?: string[]; /** True when local editable metadata differs from the fetched remote metadata. */ metadata_dirty?: boolean; + /** Materialized final run state consumed by readers instead of folding raw operations. */ + final_state?: RunFinalState; + /** Operation-log watermark for the materialized final state. */ + oplog_watermark?: RunOplogWatermark; /** * Live execution status. 
Only present for Dashboard-launched runs that are * still being tracked in-memory — used to render a spinner in RunList @@ -44,6 +48,17 @@ export interface RunMeta { status?: 'starting' | 'running' | 'finished' | 'failed'; } +export interface RunOplogWatermark { + ref: string; + operation_id?: string; + updated_at?: string; +} + +export interface RunFinalState { + lifecycle: 'active' | 'hidden' | 'deleted'; + tags: string[]; +} + export interface RunListResponse { runs: RunMeta[]; next_cursor?: string; @@ -149,6 +164,8 @@ export interface RunDetailResponse { results: EvalResult[]; source: 'local' | 'remote'; source_label?: string; + final_state?: RunFinalState; + oplog_watermark?: RunOplogWatermark; /** Live execution status when this run is still tracked in-memory by Dashboard. */ status?: 'starting' | 'running' | 'finished' | 'failed'; /** Path to the run workspace directory (relative to cwd when inside, otherwise absolute). Local runs only. */ @@ -260,6 +277,8 @@ export interface CompareRunEntry { remote_tags?: string[]; pending_tags?: string[]; metadata_dirty?: boolean; + final_state?: RunFinalState; + oplog_watermark?: RunOplogWatermark; source: 'local' | 'remote'; eval_count: number; quality_count?: number; @@ -283,6 +302,8 @@ export interface RunTagsResponse { remote_tags?: string[]; pending_tags?: string[]; metadata_dirty?: boolean; + final_state?: RunFinalState; + oplog_watermark?: RunOplogWatermark; updated_at: string; } From add584deb771fa2476cdcbac4743ce7088ecb622 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 05:22:06 +0200 Subject: [PATCH 03/21] feat(dashboard): lazy load transcript artifacts --- apps/cli/src/commands/inspect/utils.ts | 4 +- apps/cli/src/commands/results/manifest.ts | 69 +++++- apps/cli/src/commands/results/serve.ts | 209 ++++++++++++++++++- apps/cli/test/commands/results/serve.test.ts | 202 ++++++++++++++++++ apps/dashboard/src/components/EvalDetail.tsx | 154 ++++++++------ apps/dashboard/src/lib/api.ts | 27 
+++ apps/dashboard/src/lib/types.ts | 13 ++ 7 files changed, 590 insertions(+), 88 deletions(-) diff --git a/apps/cli/src/commands/inspect/utils.ts b/apps/cli/src/commands/inspect/utils.ts index 6ea549678..574e9de81 100644 --- a/apps/cli/src/commands/inspect/utils.ts +++ b/apps/cli/src/commands/inspect/utils.ts @@ -8,7 +8,7 @@ import { resolveExistingRunPrimaryPath, resolveWorkspaceOrFilePath, } from '../eval/result-layout.js'; -import { loadManifestResults } from '../results/manifest.js'; +import { loadLightweightResults, loadManifestResults } from '../results/manifest.js'; import { ResultRowSchemaError, normalizeResultRow } from '../results/result-row-schema.js'; // ANSI color codes (no dependency needed) @@ -636,7 +636,7 @@ export function listResultFilesFromRunsDir(runsDir: string, limit?: number): Res for (const { filePath, displayName, runId } of limited) { try { const fileStat = statSync(filePath); - const results = loadResultFile(filePath); + const results = loadLightweightResults(filePath); const testCount = results.length; const passCount = results.filter((r) => r.score >= DEFAULT_THRESHOLD).length; diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 65044552e..44885fb78 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -43,6 +43,8 @@ export interface ResultManifestRecord { readonly output_path?: string; readonly answer_path?: string; readonly transcript_path?: string; + readonly transcript?: ArtifactPointer; + readonly artifacts?: ArtifactPointerMap; readonly response_path?: string; readonly artifact_dir?: string; readonly task_dir?: string; @@ -53,6 +55,36 @@ export interface ResultManifestRecord { readonly metadata?: Record; } +export type ArtifactPointer = + | string + | { + readonly path?: unknown; + readonly artifact_path?: unknown; + readonly relative_path?: unknown; + readonly ref?: unknown; + readonly storage?: unknown; + readonly uri?: 
unknown; + readonly href?: unknown; + readonly [key: string]: unknown; + }; + +export interface ArtifactPointerMap { + readonly transcript_path?: string; + readonly answer_path?: string; + readonly transcript?: ArtifactPointer; + readonly answer?: ArtifactPointer; + readonly [key: string]: unknown; +} + +export interface ManifestHydrationOptions { + /** + * Defaults to true for report/inspect consumers that need a trace projection. + * Dashboard detail routes set this false so transcript bodies are loaded only + * by the explicit transcript artifact endpoint. + */ + readonly hydrateTranscriptTrace?: boolean; +} + function parseJsonlLines(content: string): T[] { return content .split(/\r?\n/) @@ -142,13 +174,19 @@ function hydrateOutput( return responseText.trimEnd(); } -function hydrateTrace(baseDir: string, record: ResultManifestRecord): EvaluationResult['trace'] { - const transcriptText = readOptionalText(baseDir, record.transcript_path); - if (transcriptText) { - try { - return traceFromTranscriptJsonLines(parseJsonlLines(transcriptText)); - } catch { - // Fall through to a minimal trace below. +function hydrateTrace( + baseDir: string, + record: ResultManifestRecord, + options: ManifestHydrationOptions, +): EvaluationResult['trace'] { + if (options.hydrateTranscriptTrace !== false) { + const transcriptText = readOptionalText(baseDir, record.transcript_path); + if (transcriptText) { + try { + return traceFromTranscriptJsonLines(parseJsonlLines(transcriptText)); + } catch { + // Fall through to a minimal trace below. 
+ } } } @@ -163,7 +201,11 @@ function hydrateTrace(baseDir: string, record: ResultManifestRecord): Evaluation }); } -function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): EvaluationResult { +function hydrateManifestRecord( + baseDir: string, + record: ResultManifestRecord, + options: ManifestHydrationOptions, +): EvaluationResult { const grading = readOptionalJson(baseDir, record.grading_path); const timing = readOptionalJson(baseDir, record.timing_path); const testId = record.test_id ?? 'unknown'; @@ -218,7 +260,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E costUsd: record.cost_usd, input: hydrateInput(baseDir, record), output: hydrateOutput(baseDir, record) ?? '', - trace: hydrateTrace(baseDir, record), + trace: hydrateTrace(baseDir, record, options), metadata: record.metadata, } as EvaluationResult; } @@ -235,12 +277,15 @@ export function resolveResultSourcePath(source: string, cwd?: string): string { return resolved; } -export function loadManifestResults(sourceFile: string): EvaluationResult[] { +export function loadManifestResults( + sourceFile: string, + options: ManifestHydrationOptions = {}, +): EvaluationResult[] { const resolvedSourceFile = resolveRunManifestPath(sourceFile); const content = readFileSync(resolvedSourceFile, 'utf8'); const records = parseResultRows(content, resolvedSourceFile); const baseDir = path.dirname(resolvedSourceFile); - return records.map((record) => hydrateManifestRecord(baseDir, record)); + return records.map((record) => hydrateManifestRecord(baseDir, record, options)); } export interface LightweightResultRecord { @@ -253,6 +298,7 @@ export interface LightweightResultRecord { readonly scores?: readonly Record[]; readonly executionStatus?: string; readonly error?: string; + readonly costUsd?: number; readonly timestamp?: string; } @@ -269,6 +315,7 @@ export function loadLightweightResults(sourceFile: string): LightweightResultRec scores: record.scores, 
executionStatus: record.execution_status, error: record.error, + costUsd: record.cost_usd, timestamp: record.timestamp, })); } diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 839fc68a5..75279e7ab 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -67,7 +67,12 @@ import { } from './combine-run.js'; import { deleteLocalRun } from './delete-run.js'; import { getActiveRunStatus, getActiveRunTarget, registerEvalRoutes } from './eval-runner.js'; -import { loadLightweightResults, loadManifestResults, parseResultManifest } from './manifest.js'; +import { + type ResultManifestRecord, + loadLightweightResults, + loadManifestResults, + parseResultManifest, +} from './manifest.js'; import { type SourcedResultFileMeta, clearRemoteRunTags, @@ -285,6 +290,117 @@ function contentDispositionFilename(filePath: string): string { return path.basename(filePath).replace(/["\\\r\n]/g, '_'); } +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function nonEmptyString(value: unknown): string | undefined { + return typeof value === 'string' && value.trim() ? value.trim() : undefined; +} + +function artifactPointerPath(pointer: unknown): string | undefined { + if (typeof pointer === 'string') return nonEmptyString(pointer); + if (!isRecord(pointer)) return undefined; + return ( + nonEmptyString(pointer.path) ?? + nonEmptyString(pointer.artifact_path) ?? + nonEmptyString(pointer.relative_path) + ); +} + +function artifactPointerDescription(pointer: unknown): string | undefined { + if (typeof pointer === 'string') return pointer; + if (!isRecord(pointer)) return undefined; + const ref = nonEmptyString(pointer.ref); + const storage = nonEmptyString(pointer.storage); + const uri = nonEmptyString(pointer.uri) ?? 
nonEmptyString(pointer.href); + const pointerPath = artifactPointerPath(pointer); + const parts = [ + ref ? `ref ${ref}` : undefined, + storage ? `storage ${storage}` : undefined, + uri ? `uri ${uri}` : undefined, + pointerPath ? `path ${pointerPath}` : undefined, + ].filter((part): part is string => part !== undefined); + return parts.length > 0 ? parts.join(', ') : undefined; +} + +function artifactPointerRef(pointer: unknown): string | undefined { + return isRecord(pointer) ? nonEmptyString(pointer.ref) : undefined; +} + +interface ResolvedArtifactPointer { + readonly path?: string; + readonly description?: string; + readonly ref?: string; + readonly unsupportedReason?: string; +} + +function resolveRecordArtifactPointer( + record: ResultManifestRecord, + kind: 'transcript' | 'answer', +): ResolvedArtifactPointer { + const directPath = + kind === 'transcript' + ? (record.transcript_path ?? record.artifacts?.transcript_path) + : (record.answer_path ?? record.artifacts?.answer_path ?? record.output_path); + if (directPath) { + return { path: directPath, description: directPath }; + } + + const pointer = + kind === 'transcript' + ? (record.transcript ?? record.artifacts?.transcript) + : record.artifacts?.answer; + const pointerPath = artifactPointerPath(pointer); + const description = artifactPointerDescription(pointer); + const ref = artifactPointerRef(pointer); + if (pointerPath) { + return { path: pointerPath, description, ref }; + } + if (pointer) { + return { + description, + ref, + unsupportedReason: description + ? 
`${kind} artifact pointer does not include a local path (${description}).` + : `${kind} artifact pointer does not include a local path.`, + }; + } + return {}; +} + +function resolveRunArtifactPath( + baseDir: string, + relativePath: string, +): { absolutePath?: string; error?: string } { + const absolutePath = path.resolve(baseDir, relativePath); + const resolvedBase = path.resolve(baseDir); + if (absolutePath !== resolvedBase && !absolutePath.startsWith(`${resolvedBase}${path.sep}`)) { + return { error: 'Artifact path is outside the run workspace.' }; + } + return { absolutePath }; +} + +function readOptionalRunArtifactText( + baseDir: string, + artifact: ResolvedArtifactPointer, +): string | undefined { + if (!artifact.path) return undefined; + const resolved = resolveRunArtifactPath(baseDir, artifact.path); + if (!resolved.absolutePath) return undefined; + if (!existsSync(resolved.absolutePath) || !statSync(resolved.absolutePath).isFile()) { + return undefined; + } + return readFileSync(resolved.absolutePath, 'utf8'); +} + +function missingTranscriptMessage(): string { + return [ + 'This result does not include canonical outputs/transcript.jsonl metadata.', + 'Dashboard does not parse response.md or markdown transcripts for this view.', + ].join(' '); +} + function stripHeavyFields(results: readonly EvaluationResult[]) { return results.map((r) => { const { requests, trace, ...rest } = r as EvaluationResult & Record; @@ -402,7 +518,7 @@ async function loadManifestResultsForMeta( projectId?: string, ): Promise { await ensureRunReadable(searchDir, meta, projectId); - return loadManifestResults(meta.path); + return loadManifestResults(meta.path, { hydrateTranscriptTrace: false }); } async function loadLightweightResultsForMeta( @@ -824,6 +940,8 @@ async function handleEvalFiles(c: C, { searchDir, projectId }: DataContext) { if (!record) return c.json({ error: 'Eval not found' }, 404); const baseDir = path.dirname(meta.path); + const transcriptArtifact = 
resolveRecordArtifactPointer(record, 'transcript'); + const answerArtifact = resolveRecordArtifactPointer(record, 'answer'); const knownPaths = [ record.grading_path, record.timing_path, @@ -832,12 +950,14 @@ async function handleEvalFiles(c: C, { searchDir, projectId }: DataContext) { record.response_path, record.answer_path, record.transcript_path, + transcriptArtifact.path, + answerArtifact.path, record.task_dir, record.eval_path, record.targets_path, record.files_path, record.graders_path, - ].filter((p): p is string => !!p); + ].filter((p, index, all): p is string => !!p && all.indexOf(p) === index); if (knownPaths.length === 0) return c.json({ files: [] }); @@ -910,6 +1030,69 @@ async function handleEvalFileContent(c: C, { searchDir, projectId }: DataContext } } +async function handleEvalTranscript(c: C, { searchDir, projectId }: DataContext) { + const filename = c.req.param('filename') ?? ''; + const evalId = c.req.param('evalId'); + const meta = await findRunById(searchDir, filename, projectId); + if (!meta) return c.json({ error: 'Run not found' }, 404); + + try { + const records = await parseManifestForMeta(searchDir, meta, projectId); + const record = records.find((r) => r.test_id === evalId); + if (!record) return c.json({ error: 'Eval not found' }, 404); + + const baseDir = path.dirname(meta.path); + const transcript = resolveRecordArtifactPointer(record, 'transcript'); + const answer = resolveRecordArtifactPointer(record, 'answer'); + + if (!transcript.path) { + return c.json({ + status: transcript.unsupportedReason ? 'unsupported' : 'missing', + message: transcript.unsupportedReason ?? missingTranscriptMessage(), + ...(transcript.description && { pointer: transcript.description }), + }); + } + + const resolvedTranscript = resolveRunArtifactPath(baseDir, transcript.path); + if (!resolvedTranscript.absolutePath) { + return c.json({ + status: 'dangling', + transcript_path: transcript.path, + message: resolvedTranscript.error ?? 
'Transcript artifact path could not be resolved.', + ...(transcript.description && { pointer: transcript.description }), + }); + } + + if ( + !existsSync(resolvedTranscript.absolutePath) || + !statSync(resolvedTranscript.absolutePath).isFile() + ) { + const refMessage = transcript.ref ? ` on ${transcript.ref}` : ''; + return c.json({ + status: 'dangling', + transcript_path: transcript.path, + message: `Transcript artifact pointer${refMessage} is present, but ${transcript.path} is not available in this run workspace.`, + ...(transcript.description && { pointer: transcript.description }), + }); + } + + const content = readFileSync(resolvedTranscript.absolutePath, 'utf8'); + const answerContent = readOptionalRunArtifactText(baseDir, answer); + + return c.json({ + status: 'ok', + transcript_path: transcript.path, + content, + language: inferLanguage(transcript.path), + ...(answer.path && { answer_path: answer.path }), + ...(answerContent !== undefined && { answer_content: answerContent }), + ...(transcript.description && { pointer: transcript.description }), + }); + } catch { + return c.json({ error: 'Failed to load transcript artifact' }, 500); + } +} + async function handleExperiments(c: C, { searchDir, agentvDir, projectId }: DataContext) { const { runs: metas } = await listMergedResultFiles(searchDir, undefined, projectId); const { threshold: pass_threshold } = loadStudioConfig(agentvDir); @@ -1946,6 +2129,9 @@ export function createApp( handleCategorySuites(c, defaultCtx), ); app.get('/api/runs/:filename/evals/:evalId', (c) => handleEvalDetail(c, defaultCtx)); + app.get('/api/runs/:filename/evals/:evalId/transcript', (c) => + handleEvalTranscript(c, defaultCtx), + ); app.get('/api/runs/:filename/evals/:evalId/files', (c) => handleEvalFiles(c, defaultCtx)); app.get('/api/runs/:filename/evals/:evalId/files/*', (c) => handleEvalFileContent(c, defaultCtx)); app.get('/api/experiments', (c) => handleExperiments(c, defaultCtx)); @@ -1976,11 +2162,11 @@ export function 
createApp( let testCount = m.testCount; let executionErrorCount = 0; try { - const loaded = await loadManifestResultsForMeta(searchDir, m, defaultCtx.projectId); - totalCostUsd = loaded.reduce((sum, r) => sum + (r.costUsd ?? 0), 0); - if (loaded.length > 0) { + const records = await loadLightweightResultsForMeta(searchDir, m, defaultCtx.projectId); + totalCostUsd = records.reduce((sum, r) => sum + (r.costUsd ?? 0), 0); + if (records.length > 0) { const qualitySummary = summarizeQualityResults( - loaded, + records, loadStudioConfig(agentvDir).threshold, ); testCount = qualitySummary.totalCount; @@ -2064,6 +2250,9 @@ export function createApp( app.get('/api/projects/:projectId/runs/:filename/evals/:evalId', (c) => withProject(c, handleEvalDetail), ); + app.get('/api/projects/:projectId/runs/:filename/evals/:evalId/transcript', (c) => + withProject(c, handleEvalTranscript), + ); app.get('/api/projects/:projectId/runs/:filename/evals/:evalId/files', (c) => withProject(c, handleEvalFiles), ); @@ -2283,19 +2472,19 @@ export const resultsServeCommand = command({ // project's configured run workspace and fall back to the empty state. if (source) { sourceFile = await resolveSourceFile(source, cwd); - results = loadManifestResults(sourceFile); + results = loadManifestResults(sourceFile, { hydrateTranscriptTrace: false }); } else { // Auto-discover: run cache -> directory scan -> empty state const cache = await loadRunCache(cwd); const cachedFile = cache ? 
resolveRunCacheFile(cache) : ''; if (cachedFile && existsSync(cachedFile)) { sourceFile = cachedFile; - results = loadManifestResults(cachedFile); + results = loadManifestResults(cachedFile, { hydrateTranscriptTrace: false }); } else { const metas = listResultFiles(cwd, 1); if (metas.length > 0) { sourceFile = metas[0].path; - results = loadManifestResults(metas[0].path); + results = loadManifestResults(metas[0].path, { hydrateTranscriptTrace: false }); } // If no metas, results stays empty — dashboard shows welcome state } diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 834d041ca..8b36d1d7f 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -2538,6 +2538,208 @@ describe('serve app', () => { }); }); + describe('GET /api/runs/:filename/evals/:evalId/transcript', () => { + it('loads canonical transcript JSONL lazily from the manifest pointer', async () => { + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'with-transcript'); + const runId = 'with-transcript::2026-03-25T10-00-00-000Z'; + const timestampDir = path.join(runsDir, '2026-03-25T10-00-00-000Z'); + const transcriptArtifactPath = 'demo/test-greeting/outputs/transcript.jsonl'; + const answerArtifactPath = 'demo/test-greeting/outputs/answer.md'; + const transcriptPath = path.join(timestampDir, transcriptArtifactPath); + const answerPath = path.join(timestampDir, answerArtifactPath); + const transcriptJsonl = `${JSON.stringify({ + test_id: 'test-greeting', + target: 'gpt-4o', + message_index: 0, + role: 'user', + content: 'Hello', + })}\n`; + + mkdirSync(path.dirname(transcriptPath), { recursive: true }); + writeFileSync(transcriptPath, transcriptJsonl); + writeFileSync(answerPath, 'Hello, Alice!'); + writeFileSync( + path.join(timestampDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + experiment: 'with-transcript', + transcript_path: transcriptArtifactPath, + answer_path: 
answerArtifactPath, + }), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request( + `/api/runs/${encodeURIComponent(runId)}/evals/test-greeting/transcript`, + ); + + expect(res.status).toBe(200); + const data = (await res.json()) as { + status: string; + transcript_path: string; + content: string; + answer_path: string; + answer_content: string; + }; + expect(data).toMatchObject({ + status: 'ok', + transcript_path: transcriptArtifactPath, + content: transcriptJsonl, + answer_path: answerArtifactPath, + answer_content: 'Hello, Alice!', + }); + }); + + it('loads pointer-shaped transcript metadata when it resolves to a local artifact path', async () => { + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'pointer-transcript'); + const runId = 'pointer-transcript::2026-03-25T11-00-00-000Z'; + const timestampDir = path.join(runsDir, '2026-03-25T11-00-00-000Z'); + const artifactPath = 'demo/test-greeting/outputs/transcript.jsonl'; + const transcriptPath = path.join(timestampDir, artifactPath); + const transcriptJsonl = `${JSON.stringify({ + test_id: 'test-greeting', + target: 'gpt-4o', + message_index: 0, + role: 'assistant', + content: 'Hello', + })}\n`; + + mkdirSync(path.dirname(transcriptPath), { recursive: true }); + writeFileSync(transcriptPath, transcriptJsonl); + writeFileSync( + path.join(timestampDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + experiment: 'pointer-transcript', + artifacts: { + transcript: { + ref: 'agentv/results/v1/artifacts', + path: artifactPath, + }, + }, + }), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request( + `/api/runs/${encodeURIComponent(runId)}/evals/test-greeting/transcript`, + ); + + expect(res.status).toBe(200); + const data = (await res.json()) as { + status: string; + transcript_path: string; + content: string; + pointer: string; + }; + expect(data.status).toBe('ok'); + 
expect(data.transcript_path).toBe(artifactPath); + expect(data.content).toBe(transcriptJsonl); + expect(data.pointer).toContain('agentv/results/v1/artifacts'); + }); + + it('returns a clear missing state when no transcript pointer is recorded', async () => { + const runId = writeLocalRunArtifact( + tempDir, + 'missing-transcript', + '2026-03-25T12-00-00-000Z', + RESULT_A, + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request( + `/api/runs/${encodeURIComponent(runId)}/evals/test-greeting/transcript`, + ); + + expect(res.status).toBe(200); + const data = (await res.json()) as { status: string; message: string }; + expect(data.status).toBe('missing'); + expect(data.message).toContain('outputs/transcript.jsonl'); + }); + + it('returns a clear dangling state when the transcript pointer cannot be read', async () => { + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'dangling-transcript'); + const runId = 'dangling-transcript::2026-03-25T13-00-00-000Z'; + const timestampDir = path.join(runsDir, '2026-03-25T13-00-00-000Z'); + const artifactPath = 'demo/test-greeting/outputs/transcript.jsonl'; + + mkdirSync(timestampDir, { recursive: true }); + writeFileSync( + path.join(timestampDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + experiment: 'dangling-transcript', + transcript_path: artifactPath, + }), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request( + `/api/runs/${encodeURIComponent(runId)}/evals/test-greeting/transcript`, + ); + + expect(res.status).toBe(200); + const data = (await res.json()) as { + status: string; + transcript_path: string; + message: string; + }; + expect(data.status).toBe('dangling'); + expect(data.transcript_path).toBe(artifactPath); + expect(data.message).toContain('not available'); + }); + + it('does not read transcript bodies for list, detail, or aggregate routes', async () => { + const timestamp = 
'2026-03-25T14-00-00-000Z'; + const transcriptArtifactPath = 'demo/test-greeting/outputs/transcript.jsonl'; + const runId = writeLocalRunArtifact(tempDir, 'lazy-guard', timestamp, { + ...RESULT_A, + transcript_path: transcriptArtifactPath, + }); + const timestampDir = path.join( + tempDir, + '.agentv', + 'results', + 'runs', + 'lazy-guard', + timestamp, + ); + mkdirSync(path.join(timestampDir, transcriptArtifactPath), { recursive: true }); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + + const listRes = await app.request('/api/runs'); + expect(listRes.status).toBe(200); + const listData = (await listRes.json()) as { + runs: Array<{ filename: string; target?: string }>; + }; + expect(listData.runs.find((run) => run.filename === runId)?.target).toBe('gpt-4o'); + + const detailRes = await app.request(`/api/runs/${encodeURIComponent(runId)}`); + expect(detailRes.status).toBe(200); + const detailData = (await detailRes.json()) as { results: unknown[] }; + expect(detailData.results).toHaveLength(1); + + const compareRes = await app.request('/api/compare'); + expect(compareRes.status).toBe(200); + const compareData = (await compareRes.json()) as { + cells: Array<{ experiment: string; eval_count: number }>; + }; + expect(compareData.cells.find((cell) => cell.experiment === 'lazy-guard')?.eval_count).toBe( + 1, + ); + + const indexRes = await app.request('/api/index'); + expect(indexRes.status).toBe(200); + const indexData = (await indexRes.json()) as { + entries: Array<{ run_filename: string; total_cost_usd: number }>; + }; + expect(indexData.entries.find((entry) => entry.run_filename === runId)?.total_cost_usd).toBe( + RESULT_A.cost_usd, + ); + }); + }); + describe('GET /api/runs/:filename/evals/:evalId/files/*', () => { it('loads file content for experiment-scoped run ids', async () => { const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'with-skills'); diff --git a/apps/dashboard/src/components/EvalDetail.tsx 
b/apps/dashboard/src/components/EvalDetail.tsx index f1369b947..c9c8059a4 100644 --- a/apps/dashboard/src/components/EvalDetail.tsx +++ b/apps/dashboard/src/components/EvalDetail.tsx @@ -14,8 +14,10 @@ import { isPassing, projectEvalFileContentOptions, projectEvalFilesOptions, + projectEvalTranscriptOptions, useEvalFileContent, useEvalFiles, + useEvalTranscript, useStudioConfig, } from '~/lib/api'; import type { @@ -32,12 +34,7 @@ import type { FileNode } from './FileTree'; import { FileTree } from './FileTree'; import { MonacoViewer } from './MonacoViewer'; import { ScoreBar } from './ScoreBar'; -import { - TranscriptTimeline, - findAnswerPath, - findTranscriptPath, - parseTranscriptJsonl, -} from './TranscriptTimeline'; +import { TranscriptTimeline, parseTranscriptJsonl } from './TranscriptTimeline'; interface EvalDetailProps { eval: EvalResult; @@ -457,49 +454,68 @@ function TranscriptTab({ onOpenFile: (path: string) => void; }) { const evalId = result.testId; - const { data: filesData, isLoading: isLoadingFiles } = projectId - ? useQuery(projectEvalFilesOptions(projectId, runId, evalId)) - : useEvalFiles(runId, evalId); - const files = filesData?.files ?? []; - const transcriptPath = findTranscriptPath(files); - const answerPath = findAnswerPath(files); - - const { data: transcriptContentData, isLoading: isLoadingTranscript } = projectId - ? useQuery(projectEvalFileContentOptions(projectId, runId, evalId, transcriptPath ?? '')) - : useEvalFileContent(runId, evalId, transcriptPath ?? ''); - const { data: answerContentData } = projectId - ? useQuery(projectEvalFileContentOptions(projectId, runId, evalId, answerPath ?? '')) - : useEvalFileContent(runId, evalId, answerPath ?? ''); + const { + data: transcriptData, + isLoading: isLoadingTranscript, + error: transcriptError, + } = projectId + ? 
useQuery(projectEvalTranscriptOptions(projectId, runId, evalId)) + : useEvalTranscript(runId, evalId); + const transcriptPath = transcriptData?.transcript_path; + const answerPath = transcriptData?.answer_path; + const transcriptContent = transcriptData?.status === 'ok' ? (transcriptData.content ?? '') : ''; const parsedTranscript = useMemo( - () => parseTranscriptJsonl(transcriptContentData?.content ?? ''), - [transcriptContentData?.content], + () => parseTranscriptJsonl(transcriptContent), + [transcriptContent], ); - if (isLoadingFiles) { + if (isLoadingTranscript) { return (
- Loading transcript artifacts... + Loading transcript artifact...
); } - if (!transcriptPath) { + if (transcriptError) { + return ( +
+

Transcript could not be loaded

+

{transcriptError.message}

+
+ ); + } + + if (!transcriptData || transcriptData.status === 'missing') { return (

No structured transcript

- This run does not include canonical outputs/transcript.jsonl. Dashboard does - not parse response.md or markdown transcripts for this view. + {transcriptData?.message ?? + 'This run does not include canonical outputs/transcript.jsonl. Dashboard does not parse response.md or markdown transcripts for this view.'}

); } - if (isLoadingTranscript) { + if (transcriptData.status === 'dangling' || transcriptData.status === 'unsupported') { return ( -
- Loading {transcriptPath}... +
+

+ {transcriptData.status === 'dangling' + ? 'Transcript artifact unavailable' + : 'Transcript pointer unsupported'} +

+

+ {transcriptData.message ?? 'The transcript artifact could not be resolved.'} +

+ {transcriptPath ? ( +

{transcriptPath}

+ ) : null} + {transcriptData.pointer ? ( +

{transcriptData.pointer}

+ ) : null}
); } @@ -510,27 +526,31 @@ function TranscriptTab({

Transcript could not be parsed

{parsedTranscript.error}

- - - Open raw JSONL - + {transcriptPath ? ( + <> + + + Open raw JSONL + + + ) : null}
); @@ -550,25 +570,29 @@ function TranscriptTab({ const answerHref = answerPath ? artifactFileContentUrl({ projectId, runId, evalId, filePath: answerPath, raw: true }) : undefined; - const transcriptHref = artifactFileContentUrl({ - projectId, - runId, - evalId, - filePath: transcriptPath, - raw: true, - }); - const transcriptDownloadHref = artifactFileContentUrl({ - projectId, - runId, - evalId, - filePath: transcriptPath, - download: true, - }); + const transcriptHref = transcriptPath + ? artifactFileContentUrl({ + projectId, + runId, + evalId, + filePath: transcriptPath, + raw: true, + }) + : undefined; + const transcriptDownloadHref = transcriptPath + ? artifactFileContentUrl({ + projectId, + runId, + evalId, + filePath: transcriptPath, + download: true, + }) + : undefined; return ( (url: string): Promise { @@ -231,6 +232,17 @@ export function evalFileContentOptions(runId: string, evalId: string, filePath: }); } +export function evalTranscriptOptions(runId: string, evalId: string) { + return queryOptions({ + queryKey: ['runs', runId, 'evals', evalId, 'transcript'], + queryFn: () => + fetchJson( + `/api/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}/transcript`, + ), + enabled: !!runId && !!evalId, + }); +} + export function runCategoriesOptions(runId: string) { return queryOptions({ queryKey: ['runs', runId, 'categories'], @@ -321,6 +333,10 @@ export function useEvalFileContent(runId: string, evalId: string, filePath: stri return useQuery(evalFileContentOptions(runId, evalId, filePath)); } +export function useEvalTranscript(runId: string, evalId: string) { + return useQuery(evalTranscriptOptions(runId, evalId)); +} + export function useRunCategories(runId: string) { return useQuery(runCategoriesOptions(runId)); } @@ -553,6 +569,17 @@ export function projectEvalFileContentOptions( }); } +export function projectEvalTranscriptOptions(projectId: string, runId: string, evalId: string) { + return queryOptions({ + queryKey: ['projects', 
projectId, 'runs', runId, 'evals', evalId, 'transcript'], + queryFn: () => + fetchJson( + `${projectApiBase(projectId)}/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}/transcript`, + ), + enabled: !!projectId && !!runId && !!evalId, + }); +} + export function projectExperimentsOptions(projectId: string) { return queryOptions({ queryKey: ['projects', projectId, 'experiments'], diff --git a/apps/dashboard/src/lib/types.ts b/apps/dashboard/src/lib/types.ts index 087836a37..40d1c7f92 100644 --- a/apps/dashboard/src/lib/types.ts +++ b/apps/dashboard/src/lib/types.ts @@ -176,6 +176,19 @@ export interface EvalDetailResponse { eval: EvalResult; } +export type TranscriptArtifactStatus = 'ok' | 'missing' | 'dangling' | 'unsupported'; + +export interface TranscriptArtifactResponse { + status: TranscriptArtifactStatus; + transcript_path?: string; + answer_path?: string; + answer_content?: string; + content?: string; + language?: string; + message?: string; + pointer?: string; +} + export interface IndexEntry { run_filename: string; display_name?: string; From 8fdf9bd8cc76e2d22c310afc1f8c06dbbcaf5ec9 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 05:22:10 +0200 Subject: [PATCH 04/21] docs(results): specify av-quf storage plan --- ...21-001-feat-av-quf-results-storage-plan.md | 879 ++++++++++++++++++ 1 file changed, 879 insertions(+) create mode 100644 docs/plans/2026-06-21-001-feat-av-quf-results-storage-plan.md diff --git a/docs/plans/2026-06-21-001-feat-av-quf-results-storage-plan.md b/docs/plans/2026-06-21-001-feat-av-quf-results-storage-plan.md new file mode 100644 index 000000000..71a1d7253 --- /dev/null +++ b/docs/plans/2026-06-21-001-feat-av-quf-results-storage-plan.md @@ -0,0 +1,879 @@ +--- +title: "feat: Specify git-native results storage, retention, and oplog" +type: feat +date: 2026-06-21 +bead: av-quf +base: av-vwa.16.10 +--- + +# feat: Specify git-native results storage, retention, and oplog + +## Summary + +Bead 
`av-quf` should turn the current git-backed results implementation into a +documented storage contract with three backend modes, a single results branch, +one artifact sidecar namespace, retention and compaction rules, a compact +publication export, an append-only mutable-operation log, and an S3-compatible +object-storage tier. + +The canonical AgentV run artifacts stay `benchmark.json`, `index.jsonl`, per-test +grading/timing files, `outputs/trace.json`, and derived transcript artifacts. GitHub, +Backblaze B2, Phoenix, Hugging Face, and Dashboard are projections, viewers, or storage +backends over those artifacts. + +--- + +## Problem Frame + +`packages/core/src/evaluation/results-repo.ts` already implements the first git-native +slice: `agentv/results/v1` is the default results branch, `runs/**` is listed with +`git ls-tree`, `benchmark.json` blobs are read with `git cat-file --batch`, and the +branch root is a deterministic orphan genesis. Current mutable tags live under +`metadata/runs/**`, and heavy transcript sidecars are still written inside each run +workspace by `packages/core/src/evaluation/run-artifacts.ts`. + +The next implementation beads need a precise shared contract before they split work. +The contract must avoid branch proliferation, keep AgentV artifacts canonical, and +define how git, object storage, retention, publication, and mutable operations compose +without creating another hosted results platform inside AgentV. + +--- + +## Scope Boundaries + +### In Scope + +- Define storage backend modes and per-mode listing/index strategies. +- Pin the git-native ref and path layout for `agentv/results/v1`, + `agentv/results/v1/artifacts`, and `agentv/results/v1/oplog`. +- Define retention, compaction, and migration rules for run metadata and heavy artifacts. +- Define compact publication export as a derived artifact over `benchmark.json` and + `index.jsonl`, with no required `eval.txt`. 
+- Define the mutable operation log and add-wins tag set semantics. +- Define the Backblaze B2 S3-compatible object tier and secret-loading boundary. +- Name concrete files, functions, and tests for dependent implementation beads. + +### Out of Scope + +- Implementing storage backends, S3, oplog, retention, or export code in this bead. +- Adding GitHub issues or tracker runtime state. +- Creating windowed branches, per-run branches, or a hosted Dashboard replacement. +- Making Phoenix, Hugging Face, B2, or GitHub the canonical results model. + +### Deferred to Follow-Up Work + +- Path sharding under `runs/` or artifact prefixes. Only add it after a benchmark with + realistic run counts proves `git ls-tree` or object-store listing is too slow. +- PR-based publishing for human-reviewed result repositories. Machine-generated eval + results should keep direct append commits until a concrete workflow needs review gates. +- A generic non-B2 object-store provider matrix. Start with S3-compatible configuration + narrow enough to support B2 and avoid provider-specific APIs. + +--- + +## Requirements + +### Storage Modes + +- R1. The default mode remains git-native and must work with the current explicit + `repo_path` or `repo_url` results configuration. +- R2. Hybrid mode must keep the run index and metadata in git while moving selected + heavy artifact payloads to object storage. +- R3. Blob-native mode must store run index, metadata, artifacts, and oplog in object + storage without requiring a git checkout, git object database, or git remote. +- R4. Each mode must define its listing/index strategy: git tree listing for git-backed + modes and bucket manifest plus `ListObjectsV2` fallback for blob-native mode. + +### Git Layout And Sync + +- R5. The primary results ref is `agentv/results/v1`. +- R6. Heavy artifact sidecars use the single artifact ref or namespace + `agentv/results/v1/artifacts`, with path prefixes such as `transcripts/`, + `raw-logs/`, and `screenshots/`. 
+- R7. Mutable operations use the single oplog ref or namespace `agentv/results/v1/oplog`. +- R8. The git-native branch must keep deterministic orphan genesis and must not create + windowed branches or per-run branches. +- R9. Path sharding is not part of v1 unless measurement at realistic scale proves it is + needed. + +### Retention And Publication + +- R10. Retention must distinguish logical pruning from history compaction in git-backed + modes. +- R11. Hybrid and blob-native modes must support object lifecycle policy alignment for + artifact payloads without deleting index metadata prematurely. +- R12. Transcript migration must support transcripts under + `agentv/results/v1/artifacts` while preserving existing logical artifact references. +- R13. Publication export must be compact and derived from `benchmark.json` plus + `index.jsonl`; it must not require an authored or generated `eval.txt`. + +### Mutable Operations + +- R14. Mutable run/result operations must be append-only per actor first. +- R15. Tag mutation semantics start as an add-wins tag set, not direct mutation of + immutable run artifacts. +- R16. Oplog storage location must be defined for all three modes. + +### Object Storage + +- R17. The object-storage tier targets Backblaze B2 through its S3-compatible API. +- R18. The implementation must use a standard S3 SDK/client, not B2-native APIs. +- R19. Credentials must come from environment or config populated by the BWS CLI, and + resolved secret values must not be written into AgentV artifacts, config examples, or + committed docs. + +--- + +## Key Technical Decisions + +- KTD1. Backend mode is a storage concern, not a product model. Use `git-native`, + `hybrid`, and `blob-native` as storage modes while keeping `benchmark.json` and + `index.jsonl` as the artifact contract that readers consume. +- KTD2. Do not overload the existing `results.mode: github` field. 
Add + `results.storage_mode` with values `git-native`, `hybrid`, and `blob-native`, and + normalize missing `storage_mode` to `git-native`. Put object-store settings under + `results.object_store`. +- KTD3. The git tree remains the index for git-backed modes. `listGitRuns()` should + continue to list `runs/**/benchmark.json` from `agentv/results/v1`; no separate + branch-local `index/runs.jsonl` is introduced. +- KTD4. Use one artifact sidecar namespace named `artifacts`. Do not introduce + `artifact-blobs`, `blobs`, or per-artifact refs. Prefix by artifact class, for example + `transcripts/<experiment>/...`, `raw-logs/<experiment>/...`, and + `screenshots/<experiment>/...`. +- KTD5. Hybrid mode keeps git as the metadata and index authority, while object storage + stores selected heavy payload bytes. Git contains stable artifact locator records with + checksums, sizes, and logical paths so readers can verify fetched payloads. +- KTD6. Blob-native mode mirrors the same logical namespaces in the bucket, but does not + emulate git refs. It owns bucket manifests and per-prefix object listings. +- KTD7. Mutable operations are derived overlays. Existing `metadata/runs/**/tags.json` + is a compatibility read/write surface until oplog materialization replaces direct + overlay writes. +- KTD8. Publication export is a projection. It should read completed run bundles and + emit a compact publishable directory without becoming a new source of truth. +- KTD9. Backblaze B2 is addressed only through S3-compatible endpoints and Signature V4. + The object client should be a standard S3 client configured with endpoint, region, + bucket, and credentials.
+ +--- + +## High-Level Technical Design + +### Storage Topology + +```mermaid +flowchart TB + Local[Local run workspace .agentv/results/runs] --> Publish[Result publisher] + Publish --> GitIndex[agentv/results/v1 runs metadata] + Publish --> GitArtifacts[agentv/results/v1/artifacts artifact sidecar] + Publish --> Oplog[agentv/results/v1/oplog mutable ops] + Publish --> Bucket[(B2 S3-compatible bucket)] + + GitIndex --> Dashboard[Dashboard and CLI readers] + GitArtifacts --> Resolver[Artifact resolver] + Bucket --> Resolver + Oplog --> Tags[Derived tag set and future mutable views] + Resolver --> Dashboard + + GitIndex -. hybrid metadata .-> Bucket + Bucket -. blob-native manifest .-> Dashboard +``` + +### Mode Matrix + +| Mode | Canonical index/listing | Artifact payloads | Mutable ops | Git dependency | +| --- | --- | --- | --- | --- | +| `git-native` | `git ls-tree -r agentv/results/v1 -- runs/` plus `git cat-file --batch` for `benchmark.json` | `agentv/results/v1/artifacts` stores payload bytes | `agentv/results/v1/oplog` | Required | +| `hybrid` | Same primary git ref as `git-native` | Object storage stores selected payload bytes; git stores locators under the artifact namespace | `agentv/results/v1/oplog` | Required for index/oplog | +| `blob-native` | Bucket manifest under the results namespace, with `ListObjectsV2` fallback by prefix | Object storage stores all payloads | Bucket oplog prefix | None | + +### Logical Namespace Shape + +```text +agentv/results/v1 + runs/<experiment>/<timestamp>/benchmark.json + runs/<experiment>/<timestamp>/index.jsonl + runs/<experiment>/<timestamp>/<test-id> + metadata/runs/<experiment>/<timestamp>/materialized-tags.json + +agentv/results/v1/artifacts + transcripts/<experiment>/<timestamp>/<test-id>/transcript.jsonl + raw-logs/<experiment>/<timestamp>/<test-id>/<name>.jsonl + screenshots/<experiment>/<timestamp>/<test-id>/<name>.png + +agentv/results/v1/oplog + actors/<actor_id>/<timestamp>-<nonce>.json +``` + +For blob-native mode, these are bucket prefixes rather than git refs. The prefix shape +should stay recognizable so readers can share resolver logic across modes. + +--- + +## Section Specs + +### 1.
Storage Backend Abstraction And Modes + +**Decision:** Add a narrow storage abstraction around listing, publishing, +materializing artifacts, resolving artifact bytes, syncing, applying retention, and +reading oplog entries. Keep existing git helpers as the first adapter rather than +rewriting all results code at once. + +**File-level plan:** + +- `packages/core/src/evaluation/results-repo.ts` + - Keep `DEFAULT_RESULTS_BRANCH`, deterministic genesis, `listGitRuns()`, + `materializeGitRun()`, and `directPushResults()` as the git adapter's core. + - Extract or wrap adapter-facing functions instead of renaming them in the first + implementation slice. +- `packages/core/src/evaluation/loaders/config-loader.ts` + - Extend `ResultsConfig` and `parseResultsConfig()` with `storage_mode` and + `object_store`. + - Preserve current `repo_url`, `repo_path`, `branch`, `remote`, `path`, and + `sync` behavior for `git-native`. +- `packages/core/src/projects.ts` + - Add matching project-registry YAML and internal fields if Dashboard project + bindings can configure hybrid/blob-native storage. +- New core files, names to finalize during implementation: + - `packages/core/src/evaluation/results-storage.ts` for shared interfaces. + - `packages/core/src/evaluation/results-git-storage.ts` for the git adapter if + extraction from `results-repo.ts` becomes large. + - `packages/core/src/evaluation/results-object-storage.ts` for S3-compatible + primitives. +- `apps/cli/src/commands/results/remote.ts` + - Route `listMergedResultFiles()`, `getRemoteResultsStatus()`, + `ensureRemoteRunAvailable()`, and `maybeAutoExportRunArtifacts()` through the + normalized adapter. +- `apps/cli/src/commands/results/serve.ts` + - Route remote run listing, file reads, and tag mutations through storage-resolved + metadata rather than assuming a git materialized path exists. 
+ +**Per-mode listing/index strategy:** + +- `git-native`: list `runs/**/benchmark.json` with `git ls-tree`; batch-read + benchmark blobs with `git cat-file --batch`; materialize run details lazily with + `materializeGitRun()`. +- `hybrid`: list from the same git ref and read the same `benchmark.json` blobs. + Artifact locators in `index.jsonl` or sidecar manifests decide whether bytes come + from git artifacts or object storage. +- `blob-native`: read a compact run manifest from bucket storage first. If the + manifest is missing or stale, fall back to `ListObjectsV2` over + `runs/**/benchmark.json`-equivalent objects, rebuild the manifest, and continue. + Use continuation tokens because S3 listing returns a bounded page per request. + +**Test plan:** + +- `packages/core/test/evaluation/results-storage.test.ts` + - Normalizes missing storage mode to `git-native`. + - Rejects incompatible config combinations, such as `blob-native` with `repo_path` + as a hard dependency. + - Proves the adapter interface can list runs in all modes from fixtures. +- `packages/core/test/evaluation/results-repo.test.ts` + - Existing git-native tests must keep passing. + - Add coverage that `git-native` listing remains one `runs/**/benchmark.json` + tree scan, not a committed index file. +- `apps/cli/test/commands/results/serve.test.ts` + - Dashboard `/api/runs` response shape stays stable across adapter-backed sources. + +**Acceptance:** + +- A dependent implementation bead can add a new storage adapter without changing + Dashboard components. +- Existing `results.repo_path` and `results.repo_url` configs still publish and list + runs as `git-native`. +- Blob-native mode has no code path that shells out to `git`. + +### 2. Git-Native Layout + +**Decision:** Keep one primary results branch, one artifact sidecar ref, and one oplog +ref. Do not add windowed or per-run branches. Do not shard paths before measurement. 
+ +**File-level plan:** + +- `packages/core/src/evaluation/results-repo.ts` + - Keep `DEFAULT_RESULTS_BRANCH = 'agentv/results/v1'`. + - Add constants for the artifact and oplog refs: + `agentv/results/v1/artifacts` and `agentv/results/v1/oplog`. + - Extend safe-path staging to include only owned top-level paths on each ref. + - Keep `createResultsGenesisCommit()` and `createOrphanResultsBranch()` behavior + for any new git storage refs so independent clients converge on the same root. + - Keep `commitResultsRunWithTemporaryIndex()` for primary run commits. + - Add artifact-ref and oplog-ref commit helpers only if sharing the temporary-index + machinery remains simple. +- `apps/cli/src/commands/results/remote.ts` + - Keep `getResultsStorageRef()` returning the primary ref for run listing. + - Add resolver access to artifact and oplog refs without changing remote run IDs. +- `packages/core/test/evaluation/results-repo.test.ts` + - Add deterministic genesis tests for the artifact and oplog refs if they are + created by separate helper functions. + - Add tests that two clients publishing to `agentv/results/v1/artifacts` converge + rather than minting divergent orphan roots. + +**Layout rules:** + +- Primary ref `agentv/results/v1`: + - Owns `runs/**` and lightweight materialized metadata. + - Lists runs only through `runs/**/benchmark.json`. +- Artifact ref `agentv/results/v1/artifacts`: + - Owns payload classes under `transcripts/`, `raw-logs/`, and `screenshots/`. + - May store payload bytes in `git-native`. + - May store locator manifests in `hybrid`. +- Oplog ref `agentv/results/v1/oplog`: + - Owns append-only operation records under `actors/**`. + - Is never used for immutable run payloads. + +**Test plan:** + +- Unit test constants and normalized default branch. 
+- Integration test with a temporary repo that publishes: + - one run to `agentv/results/v1`; + - one transcript payload to `agentv/results/v1/artifacts`; + - one tag operation to `agentv/results/v1/oplog`. +- Assert the source checkout branch does not switch. +- Assert no windowed `agentv/results/v1/<window>` or per-run `agentv/results/run/<run-id>` refs are created. + +**Acceptance:** + +- `git for-each-ref refs/heads/agentv/results` shows only the v1 primary ref and the + two named sidecar refs for completed-result storage. +- Run listing performance is measured against realistic data before any path sharding + proposal is accepted. + +### 3. Retention, Compaction, And Transcript Migration + +**Decision:** Retention removes live references first; compaction is an explicit +maintenance action because git history and object-store versioning can keep old bytes +after logical deletion. + +**File-level plan:** + +- New core file, likely `packages/core/src/evaluation/results-retention.ts` + - Evaluate retention policy against normalized run metadata. + - Produce a deletion plan for primary run paths, artifact sidecar paths, oplog + materializations, and object-store payloads. + - Keep policy evaluation pure so git and bucket adapters can execute it. +- `packages/core/src/evaluation/results-repo.ts` + - Add git deletion commits for `runs/**`, `metadata/runs/**`, and artifact-ref + prefixes. + - Add optional compaction helpers only after logical pruning exists. +- `packages/core/src/evaluation/run-artifacts.ts` + - Preserve logical `transcript_path` values while supporting external artifact + locators. + - Add optional artifact locator metadata in `index.jsonl` rather than replacing the + existing path fields. +- `apps/cli/src/commands/results/remote.ts` + - Teach `ensureRemoteRunAvailable()` and future artifact resolvers to fetch a + transcript from `agentv/results/v1/artifacts` when the run-local path is a logical + reference.
+- `apps/cli/src/commands/results/serve.ts` + - Keep file API responses stable for transcript JSONL, whether bytes are local, + materialized from git, or streamed from object storage. + +**Git/hybrid retention rules:** + +- Logical prune commit: + - Removes selected `runs/<experiment>/<timestamp>/**` from `agentv/results/v1`. + - Removes selected artifact paths from `agentv/results/v1/artifacts` or replaces + hybrid locator records with tombstones. + - Appends retention operations to oplog when mutable state is affected. +- Compaction: + - Explicitly rewrites or re-roots storage refs after a backup/export checkpoint. + - Never runs automatically during `agentv eval`. + - Requires remote coordination because old commits and blobs can disappear after + garbage collection. + +**Bucket lifecycle rules:** + +- Hybrid: + - Keep object payloads at least as long as primary git metadata points to them. + - Use object lifecycle for expired payload classes after the git retention plan + removes or tombstones their locators. +- Blob-native: + - Bucket lifecycle can expire artifact payload prefixes independently only when the + bucket manifest and oplog policy mark them expired. + - Keep index manifests longer than payloads when publication or audit needs summary + history without large transcripts. + +**Transcript migration:** + +- Existing runs may have `transcript_path` pointing at + `<test-id>/outputs/transcript.jsonl`. +- Migration copies transcript bytes to + `agentv/results/v1/artifacts:transcripts/<experiment>/<timestamp>/<test-id>/transcript.jsonl` + or the matching object-store key. +- `index.jsonl` keeps `transcript_path` as the logical path and gains optional locator + metadata with `backend`, `ref` or bucket namespace, `path`, `sha256`, and + `size_bytes`. +- Readers resolve the logical path through locator metadata first and fall back to the + run-local file for historical bundles. + +**Test plan:** + +- `packages/core/test/evaluation/results-retention.test.ts` + - Selects old runs by timestamp and keeps protected latest runs.
+ - Plans transcript sidecar deletion only after primary metadata no longer points to it. + - Produces separate plans for git-native, hybrid, and blob-native modes. +- `packages/core/test/evaluation/run-artifacts.test.ts` + - Verifies optional artifact locator fields are snake_case and do not break + `parseJsonlResults()`. +- `apps/cli/test/commands/results/serve.test.ts` + - Serves a transcript from sidecar/object locator with the same raw/download + behavior as a run-local transcript. + +**Acceptance:** + +- Retention can remove old live runs without breaking listing for retained runs. +- A transcript migrated under `agentv/results/v1/artifacts` remains viewable through + the existing Dashboard file API. +- Compaction cannot run implicitly as a side effect of publish, sync, or Dashboard + polling. + +### 4. Compact Derived Publication Export + +**Decision:** Publication output is a derived export over the canonical run bundle. +It does not require an `eval.txt` artifact, and it does not become the source of truth +for rerun, comparison, grading, or adapter ingestion. + +**File-level plan:** + +- `apps/cli/src/commands/results/export.ts` + - Keep the current run-workspace export path aligned with + `writeArtifactsFromResults()`. + - Add or route to a publication export mode only if the CLI surface stays narrow. +- New CLI/core files if a separate command reads cleaner: + - `apps/cli/src/commands/results/publication.ts` + - `packages/core/src/evaluation/results-publication.ts` +- `packages/core/src/evaluation/run-artifacts.ts` + - Remains the source for `benchmark.json`, `index.jsonl`, and per-test artifact + schemas. +- `apps/web/src/content/docs/docs/tools/results.mdx` + - Document that publication export reads completed run artifacts and does not + require `eval.txt`. 
+ +**Publication contract:** + +- Inputs: + - completed run workspace; + - `index.jsonl` manifest; + - `benchmark.json`; + - optional sidecar-resolved artifact references for selected public payloads. +- Outputs: + - compact `benchmark.json` and `index.jsonl` or a derived `publication.json`; + - optional static assets for selected summaries; + - no required `eval.txt`. +- Privacy: + - Default export excludes raw prompts, tool args/results, transcripts, screenshots, + and raw logs unless the user opts into a payload class. + +**Test plan:** + +- `apps/cli/test/commands/results/export.test.ts` + - Publication export succeeds with only `benchmark.json` and `index.jsonl`. + - Publication export fails clearly when the manifest is not an AgentV result row. + - Payload opt-in includes only selected sidecar files. +- `apps/cli/test/commands/results/report.test.ts` + - Existing single-run HTML report remains unaffected. + +**Acceptance:** + +- A publication artifact can be generated from a run bundle that has no `eval.txt`. +- The exported publication states or embeds enough summary data for readers without + replacing the canonical run bundle. +- External viewers consume publication output as a projection, not as an AgentV run + workspace. + +### 5. Mutable Run/Result Operations Via Append-Only Oplog + +**Decision:** Implement mutable operations as per-actor append-only operation records. +Tags are the first materialized view and use add-wins set semantics. + +**File-level plan:** + +- `apps/cli/src/commands/results/remote-metadata.ts` + - Preserve current `metadata/runs/**/tags.json` behavior as a compatibility layer. + - Add read/write paths that append oplog operations before or instead of writing + materialized overlays. +- New core file, likely `packages/core/src/evaluation/results-oplog.ts` + - Define operation wire records with snake_case fields. 
+ - Define actor id, sequence/nonce, operation id, target run id, operation kind, + payload, created timestamp, and optional causal metadata. + - Implement add-wins tag projection. +- `packages/core/src/evaluation/results-repo.ts` + - Add git append helpers for `agentv/results/v1/oplog`. +- `apps/cli/src/commands/results/serve.ts` + - Route tag set, clear, and read endpoints through oplog projection for remote + runs once the adapter is available. +- `apps/dashboard/src/lib/run-list-actions.ts` and tag-related component tests + - Keep UI semantics stable: tags remain free-form chips with existing limits. + +**Operation shape:** + +```yaml +schema_version: agentv.oplog.v1 +op_id: actor-a/2026-06-21T10-00-00-000Z-01hx +actor_id: actor-a +created_at: "2026-06-21T10:00:00.000Z" +target: + run_id: with-skills::2026-06-17T10-00-00-000Z +kind: tag_add +payload: + tag: release-candidate +``` + +For tag projection, removals record `tag_remove` with the tag value. A concurrent add +and remove resolve to the tag being present when the add operation is not causally observed by the +remove. That is the add-wins rule and prevents a stale clear from deleting another +actor's later tag addition. + +**Where oplog lives by mode:** + +- `git-native`: `agentv/results/v1/oplog` git ref, under + `actors/<actor_id>/<timestamp>-<nonce>.json`. +- `hybrid`: same git oplog ref, because git remains the metadata authority. +- `blob-native`: object-store prefix + `oplog/actors/<actor_id>/<timestamp>-<nonce>.json`, with a bucket manifest + for efficient projection rebuilds. + +**Test plan:** + +- `packages/core/test/evaluation/results-oplog.test.ts` + - Projects add-wins tags from add/remove operations. + - Handles duplicate op ids idempotently. + - Keeps operations from different actors without content conflicts. + - Rejects non-snake_case or malformed operation records. +- `apps/cli/test/commands/results/remote-metadata.test.ts` + - Existing overlay tests keep passing. + - New oplog-backed tag write produces the same returned `RemoteRunTagState`.
+- `apps/cli/test/commands/results/serve.test.ts` + - Tag API returns effective tags after concurrent actor operations. + +**Acceptance:** + +- A remote tag edit appends an operation and does not rewrite immutable run artifacts. +- Concurrent tag adds from two actors both appear in the materialized tag set. +- Blob-native tag edits work without git. + +### 6. Object-Storage Tier: Backblaze B2 Through S3-Compatible API + +**Decision:** Use Backblaze B2 only through the S3-compatible API with a standard S3 +client. The B2 Native API is out of scope for this storage tier. + +**File-level plan:** + +- `packages/core/package.json` + - Add `@aws-sdk/client-s3` as a direct dependency if object storage code lands in + core. Do not rely on transitive dependencies from provider packages. +- `packages/core/src/evaluation/results-object-storage.ts` + - Create the S3-compatible client from endpoint, region, bucket, prefix, and + environment-provided credentials. + - Implement `put`, `get`, `head`, `delete`, multipart threshold decisions, and + paginated listing. + - Use `ListObjectsV2` continuation tokens for listing. +- `packages/core/src/evaluation/loaders/config-loader.ts` + - Parse object-store config with snake_case fields: + +```yaml +results: + storage_mode: hybrid + repo_path: . + object_store: + provider: s3-compatible + endpoint: ${AGENTV_RESULTS_S3_ENDPOINT} + region: ${AGENTV_RESULTS_S3_REGION} + bucket: ${AGENTV_RESULTS_S3_BUCKET} + prefix: agentv/results/v1 +``` + +- `packages/core/src/evaluation/hooks.ts` + - Reuse existing `before_session` secret-loading support where possible. A project + can run BWS before AgentV commands and inject `AGENTV_RESULTS_S3_*` variables. +- `apps/web/src/content/docs/docs/tools/dashboard.mdx` and + `apps/web/src/content/docs/docs/tools/results.mdx` + - Document that BWS is a local/CI secret source and resolved values must not be + committed. + +**B2 specifics:** + +- Endpoint format is `https://s3.<region>.backblazeb2.com`.
+- Authentication uses S3 Signature V4. +- Application key id maps to S3 access key id; application key maps to S3 secret key. +- Configure standard S3 endpoint override, region, and credentials. Do not call B2 + Native API endpoints. + +**BWS secret boundary:** + +- Recommended local/CI flow: + - BWS authenticates with `BWS_ACCESS_TOKEN`. + - BWS injects or exports the S3 endpoint, region, bucket, access key id, and secret + access key into environment variables before AgentV runs. + - AgentV config interpolates variable names or reads environment variables directly. +- Never persist resolved BWS values into `benchmark.json`, `index.jsonl`, oplog records, + Dashboard responses, docs examples, or project registry files. + +**Test plan:** + +- `packages/core/test/evaluation/results-object-storage.test.ts` + - Uses a fake S3 client or local test double to verify `PutObject`, `GetObject`, + `HeadObject`, `DeleteObject`, and paginated `ListObjectsV2` behavior. + - Verifies credentials are read from env and are not serialized into manifests. + - Verifies B2 endpoint config is passed as an S3 endpoint override. +- `packages/core/test/evaluation/loaders/config-loader.test.ts` + - Parses object-store config and rejects missing bucket/endpoint for hybrid or + blob-native modes. +- `apps/cli/test/commands/results/serve.test.ts` + - Streams a sidecar artifact from object storage through the existing file API. + +**Acceptance:** + +- Hybrid mode can write a transcript payload to B2 through the S3-compatible client + while listing the run from git. +- Blob-native mode can list runs from bucket metadata without invoking git. +- No code imports a B2-native SDK or calls B2-native API-specific operations. +- No test fixture or docs example contains resolved secret values. + +--- + +## Implementation Units + +### U1. Results Storage Config And Adapter Boundary + +- **Goal:** Add the storage-mode config and adapter interface that later units can use. 
+- **Requirements:** R1, R2, R3, R4, R19 +- **Dependencies:** None +- **Files:** `packages/core/src/evaluation/loaders/config-loader.ts`, + `packages/core/src/projects.ts`, `packages/core/src/evaluation/results-storage.ts`, + `packages/core/test/evaluation/loaders/config-loader.test.ts`, + `packages/core/test/projects.test.ts`, + `packages/core/test/evaluation/results-storage.test.ts` +- **Approach:** Introduce storage mode without overloading `results.mode: github`. + Normalize missing `storage_mode` to `git-native`, keep current git fields valid, and + define adapter methods for listing, publishing, materializing, artifact reads, + oplog reads, and retention. +- **Patterns to follow:** `normalizeResultsConfig()` in + `packages/core/src/evaluation/results-repo.ts`; `fromYaml()` and `toYaml()` in + `packages/core/src/projects.ts`; snake_case boundary rules in `.agents/conventions.md`. +- **Test scenarios:** + - Given current `repo_path: .` config with no storage mode, normalization returns + `git-native`. + - Given `storage_mode: hybrid`, parser requires valid git configuration and + `object_store`. + - Given `storage_mode: blob-native`, parser accepts `object_store` without + `repo_path` or `repo_url`. + - Given `blob-native` config with no object store, parser rejects it with a clear + warning. + - Given project registry results config with object-store fields, YAML load/save + preserves snake_case on disk and camelCase internally. + - Given legacy `mode: github`, git-native config still works and does not imply + GitHub-only storage. +- **Verification:** Existing git-native publish/list tests still compile against the + normalized config, and new mode tests do not require real network access. + +### U2. Git Refs, Sidecar Constants, And Artifact Locator Support + +- **Goal:** Pin the three git refs and add resolver support for sidecar artifacts. 
+- **Requirements:** R5, R6, R7, R8, R9, R12 +- **Dependencies:** U1 +- **Files:** `packages/core/src/evaluation/results-repo.ts`, + `packages/core/src/evaluation/run-artifacts.ts`, + `apps/cli/src/commands/results/remote.ts`, + `apps/cli/src/commands/results/serve.ts`, + `packages/core/test/evaluation/results-repo.test.ts`, + `packages/core/test/evaluation/run-artifacts.test.ts`, + `apps/cli/test/commands/results/serve.test.ts` +- **Approach:** Keep `agentv/results/v1` as the listable run ref. Add named constants + for artifact and oplog refs. Add optional artifact locator metadata while preserving + existing logical path fields such as `transcript_path`. +- **Patterns to follow:** Current deterministic genesis functions in `results-repo.ts`; + `buildIndexArtifactEntry()` and `buildResultIndexArtifact()` in `run-artifacts.ts`; + existing transcript file API tests in `serve.test.ts`. +- **Test scenarios:** + - Given a run with a sidecar transcript locator, Dashboard raw file endpoint returns + the same text/plain response as a local transcript file. + - Given no sidecar locator, historical run-local `transcript_path` still resolves. + - Given two clients create an artifact ref, the genesis commit is deterministic. + - Given a publish, no per-run or windowed result refs are created. +- **Verification:** `listGitRuns()` output is unchanged for runs that do not use sidecar + payloads. + +### U3. Retention And Compaction Planner + +- **Goal:** Add retention planning that can prune runs and sidecars without implicit + history compaction. +- **Requirements:** R10, R11, R12 +- **Dependencies:** U1, U2 +- **Files:** `packages/core/src/evaluation/results-retention.ts`, + `packages/core/src/evaluation/results-repo.ts`, + `packages/core/src/evaluation/results-object-storage.ts`, + `packages/core/test/evaluation/results-retention.test.ts`, + `packages/core/test/evaluation/results-repo.test.ts` +- **Approach:** Build a pure planner first. 
Execution adapters take the plan and create + git deletion commits or bucket deletion batches. Keep compaction as a separate + explicit operation with stronger confirmation and documentation. +- **Patterns to follow:** Safe path filters in `isSafeResultsRepoPath()` and + `existingTrackedResultsDirs()`; project sync's blocked status reporting. +- **Test scenarios:** + - Given runs older than a retention threshold, planner selects primary run paths and + sidecar paths for deletion. + - Given a sidecar transcript still referenced by a retained run, planner keeps it. + - Given object lifecycle policy shorter than metadata retention, planner reports the + mismatch instead of approving deletion. + - Given compaction is not requested, no history rewrite operation is emitted. +- **Verification:** Retention execution can be tested against a temporary git repo and a + fake object store without touching real remotes. + +### U4. Publication Export Projection + +- **Goal:** Add the compact publication export without requiring `eval.txt`. +- **Requirements:** R13 +- **Dependencies:** U1 +- **Files:** `apps/cli/src/commands/results/export.ts`, + `apps/cli/src/commands/results/index.ts`, + `packages/core/src/evaluation/results-publication.ts`, + `apps/web/src/content/docs/docs/tools/results.mdx`, + `apps/cli/test/commands/results/export.test.ts` +- **Approach:** Keep publication export read-only over completed run artifacts. Use + `parseJsonlResults()` and `benchmark.json` metadata as inputs. If a new command is + clearer than another export option, keep it under `agentv results` but document it as + projection-only. +- **Patterns to follow:** `loadExportSource()` and `deriveOutputDir()` in + `apps/cli/src/commands/results/export.ts`; `results report` docs for static output + framing. +- **Test scenarios:** + - Given a run with `index.jsonl` and `benchmark.json`, publication export succeeds + with no `eval.txt`. 
+ - Given an invalid JSONL input that is not an AgentV result row, publication export + fails with the existing result-row schema guidance. + - Given transcript payloads exist, publication export excludes them by default. + - Given payload opt-in for transcripts, publication export includes only selected + sidecar-resolved transcript files. +- **Verification:** The generated publication output can be inspected from disk and does + not modify the source run workspace. + +### U5. Oplog And Add-Wins Tag Projection + +- **Goal:** Replace direct mutable metadata writes with append-only operations and a tag + projection. +- **Requirements:** R14, R15, R16 +- **Dependencies:** U1, U2 +- **Files:** `packages/core/src/evaluation/results-oplog.ts`, + `packages/core/src/evaluation/results-repo.ts`, + `apps/cli/src/commands/results/remote-metadata.ts`, + `apps/cli/src/commands/results/serve.ts`, + `apps/dashboard/src/lib/run-list-actions.ts`, + `packages/core/test/evaluation/results-oplog.test.ts`, + `apps/cli/test/commands/results/remote-metadata.test.ts`, + `apps/cli/test/commands/results/serve.test.ts`, + `apps/dashboard/src/lib/run-list-actions.test.ts` +- **Approach:** Append `tag_add` and `tag_remove` operations per actor, materialize the + effective tag set for read performance, and keep current Dashboard tag UX stable. +- **Patterns to follow:** Current `RemoteRunTagState` shape and `metadata/runs/**` + overlay path handling in `remote-metadata.ts`. +- **Test scenarios:** + - Given two actors add different tags concurrently, both tags are visible. + - Given one actor clears tags while another later adds a tag, the later add wins. + - Given duplicate operation ids, projection is idempotent. + - Given malformed operation JSON, projection reports a warning and skips the record. + - Given blob-native mode, tag operations are stored under bucket oplog prefix and no + git command runs. +- **Verification:** Dashboard tag endpoints return the same response shape as today. 
+ +### U6. S3-Compatible Object Store And B2 Integration + +- **Goal:** Add the object-store tier used by hybrid and blob-native modes. +- **Requirements:** R2, R3, R4, R11, R17, R18, R19 +- **Dependencies:** U1 +- **Files:** `packages/core/package.json`, `bun.lock`, + `packages/core/src/evaluation/results-object-storage.ts`, + `packages/core/src/evaluation/loaders/config-loader.ts`, + `packages/core/test/evaluation/results-object-storage.test.ts`, + `packages/core/test/evaluation/loaders/config-loader.test.ts`, + `apps/web/src/content/docs/docs/tools/dashboard.mdx`, + `apps/web/src/content/docs/docs/tools/results.mdx` +- **Approach:** Add a standard S3 client wrapper with endpoint override support. Keep + B2-specific knowledge in docs/config examples and endpoint validation, not in a + B2-native SDK layer. +- **Patterns to follow:** Existing env interpolation in config loader; `hooks.before_session` + parsing in `packages/core/src/evaluation/hooks.ts`; secret-redaction posture in + task-bundle tests. +- **Test scenarios:** + - Given B2-style endpoint, region, bucket, and env credentials, object client is + configured as S3-compatible. + - Given a paginated object listing, all pages are read using continuation tokens. + - Given missing credentials, error message names variables but not values. + - Given a sidecar upload, the stored locator includes checksum and size but no secret. + - Given blob-native listing, run manifests load from bucket without git. +- **Verification:** Unit tests use a fake S3 client; no real B2 bucket is needed for CI. + +--- + +## System-Wide Impact + +- **Core:** `results-repo.ts` stops being the only remote-results boundary and becomes + the git adapter or wrapped by one. +- **CLI:** `results export`, auto-publish, and Dashboard server routes need adapter + routing but should preserve existing user-facing response shapes. +- **Dashboard:** The UI should not learn storage-specific concepts. 
It consumes the same + run list, file, and tag API responses. +- **Docs:** Results and Dashboard docs need updated wording because current docs still + imply only git-backed remote results and mention committed `.agentv/results/**` paths + in places that now flatten on-branch to `runs/**`. +- **Secrets:** Object-store credentials must stay in environment or local secret-loading + flows. The implementation must not serialize them into artifacts or Dashboard JSON. + +--- + +## Risks And Mitigations + +| Risk | Mitigation | +| --- | --- | +| Storage abstraction balloons beyond current need | Keep interface methods tied to existing results operations: list, publish, materialize/read artifact, sync, retention, oplog. | +| Hybrid locators break old readers | Keep existing logical path fields and add optional locator metadata. Old bundles keep local files; new readers prefer locators. | +| Git compaction surprises collaborators | Make compaction explicit and separate from retention. Document backup and remote coordination requirements before implementation. | +| Blob-native listing becomes expensive | Use a bucket manifest as the fast path and `ListObjectsV2` as a rebuild/fallback path. Add sharding only after measurement. | +| Secrets leak through config or artifacts | Use env interpolation and BWS injection only; tests assert secret values are absent from manifests, docs fixtures, and errors. | +| B2 differences from AWS S3 leak into core | Use standard S3 client operations and endpoint override. Keep B2-specific docs limited to endpoint/credential mapping. | + +--- + +## Acceptance Checklist + +- [ ] Spec includes one section each for storage modes, git-native layout, + retention/compaction, publication export, oplog, and object storage. +- [ ] All refs are pinned exactly: `agentv/results/v1`, + `agentv/results/v1/artifacts`, and `agentv/results/v1/oplog`. +- [ ] The artifact sidecar is called `artifacts`, not `artifact-blobs` or `blob`. 
+- [ ] The plan has no windowed or per-run branches. +- [ ] Path sharding is deferred until realistic measurement proves need. +- [ ] AgentV artifacts remain canonical; Dashboard, Hugging Face, Phoenix, B2, and + GitHub are projections/viewers/storage backends. +- [ ] File/function-level implementation guidance names current result repo, remote, + serve, export, artifact-writer, and Dashboard surfaces. +- [ ] Test plan covers core, CLI, Dashboard, and docs-facing behavior. +- [ ] Dependent beads `av-dcs`, `av-kxa`, `av-8un`, `av-dsc`, and `av-thr` can pick + up scoped implementation units without inventing storage decisions. + +--- + +## Sources And Research + +- `docs/plans/git-native-results.md` for the current git-tree-as-index contract. +- `docs/plans/results-branch-layout.md` for flattened `runs/` and `metadata/runs/` + layout. +- `packages/core/src/evaluation/results-repo.ts` for deterministic genesis, + `directPushResults()`, `listGitRuns()`, and `materializeGitRun()`. +- `packages/core/src/evaluation/run-artifacts.ts` and + `apps/cli/src/commands/eval/artifact-writer.ts` for `benchmark.json`, + `index.jsonl`, `outputs/trace.json`, and transcript sidecars. +- `apps/cli/src/commands/results/remote.ts`, + `apps/cli/src/commands/results/remote-metadata.ts`, + `apps/cli/src/commands/results/serve.ts`, and + `apps/cli/src/commands/results/export.ts` for current CLI/Dashboard remote, + metadata, serving, and export behavior. +- `docs/adr/2026-06-18-opik-post-run-export-boundary.md` for the adapter boundary + that keeps AgentV run bundles canonical. +- Backblaze B2 S3-compatible docs: + `https://www.backblaze.com/docs/cloud-storage-call-the-s3-compatible-api` and + `https://www.backblaze.com/apidocs/introduction-to-the-s3-compatible-api`. +- AWS S3 `ListObjectsV2` docs: + `https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html`. +- Bitwarden Secrets Manager CLI docs: + `https://bitwarden.com/help/secrets-manager-cli/`. 
From 88454063163c747487599d66269229582286940e Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 05:25:59 +0200 Subject: [PATCH 05/21] feat(dashboard): add trace session read model --- .../__fixtures__/trace-session-read-model.ts | 164 ++++++ .../src/lib/trace-read-model.test.ts | 139 +++++ apps/dashboard/src/lib/trace-read-model.ts | 490 ++++++++++++++++++ apps/dashboard/src/lib/types.ts | 97 ++++ 4 files changed, 890 insertions(+) create mode 100644 apps/dashboard/src/lib/__fixtures__/trace-session-read-model.ts create mode 100644 apps/dashboard/src/lib/trace-read-model.test.ts create mode 100644 apps/dashboard/src/lib/trace-read-model.ts diff --git a/apps/dashboard/src/lib/__fixtures__/trace-session-read-model.ts b/apps/dashboard/src/lib/__fixtures__/trace-session-read-model.ts new file mode 100644 index 000000000..f6fb19680 --- /dev/null +++ b/apps/dashboard/src/lib/__fixtures__/trace-session-read-model.ts @@ -0,0 +1,164 @@ +export const traceSessionEnvelopeFixture = { + schema_version: 'agentv.trace.v1', + artifact_id: 'execution-trace-fixture', + created_at: '2026-06-21T10:00:00.000Z', + eval: { + run_id: '2026-06-21T10-00-00-000Z', + test_id: 'nested-session', + suite: 'evals/github-backed.eval.yaml', + target: 'codex', + }, + trace: { + format: 'otlp_openinference_spans', + trace_id: 'trace-123', + root_span_id: 'root-span', + spans: [ + { + trace_id: 'trace-123', + span_id: 'root-span', + parent_span_id: null, + name: 'invoke_agent codex', + kind: 'INTERNAL', + start_time_unix_nano: '1000000000', + end_time_unix_nano: '2500000000', + status: { code: 'OK' }, + attributes: { + 'agentv.test_id': 'nested-session', + 'agentv.target': 'codex', + 'custom.unknown_value': { nested_value: true }, + 'gen_ai.usage.input_tokens': 14, + 'gen_ai.usage.output_tokens': 9, + }, + events: [ + { + name: 'agentv.annotation', + time_unix_nano: '1200000000', + attributes: { + event_id: 'annotation-1', + text: 'Reviewer note', + passed: true, + extra_context: { 
source: 'grader' }, + }, + }, + { + name: 'agentv.score', + time_unix_nano: '2300000000', + attributes: { + event_id: 'score-1', + score: 0.82, + text: 'Rubric score', + passed: true, + }, + }, + ], + }, + { + trace_id: 'trace-123', + span_id: 'child-chat', + parent_span_id: 'root-span', + name: 'chat gpt-5-codex', + kind: 'INTERNAL', + start_time_unix_nano: '1300000000', + end_time_unix_nano: '2200000000', + status: { code: 'OK' }, + attributes: { + 'gen_ai.operation.name': 'chat', + 'openinference.span.kind': 'LLM', + }, + events: [], + }, + { + trace_id: 'trace-123', + span_id: 'grandchild-tool', + parent_span_id: 'child-chat', + name: 'execute_tool read_file', + kind: 'INTERNAL', + start_time_unix_nano: '1500000000', + end_time_unix_nano: '1700000000', + status: { code: 'OK' }, + attributes: { + 'gen_ai.tool.name': 'read_file', + 'tool.name': 'read_file', + }, + events: [], + }, + ], + }, + source: { + kind: 'agentv_run', + path: 'index.jsonl', + provider: 'codex', + format: 'agentv_result', + version: '1', + metadata: { + external_trace: { + provider: 'phoenix', + project: 'agentv-dogfood', + session_id: 'codex-session-123', + trace_id: 'phoenix-trace-456', + url: 'https://phoenix.example/projects/agentv-dogfood/traces/phoenix-trace-456?api_key=secret', + api_key: 'secret', + }, + safe_note: 'local artifact remains canonical', + access_token: 'secret', + }, + }, + scores: [ + { + name: 'rubric', + type: 'llm-grader', + score: 0.82, + weight: 1, + verdict: 'pass', + source: 'llm', + evaluated_at: '2026-06-21T10:00:02.300Z', + target_span_id: 'root-span', + evidence: { + assertions: [{ text: 'Rubric score', passed: true }], + }, + }, + ], +}; + +export const traceSessionMissingOptionalFixture = { + schema_version: 'agentv.trace.v1', + artifact_id: 'execution-trace-missing-optionals', + created_at: '2026-06-21T10:05:00.000Z', + eval: { + run_id: '2026-06-21T10-05-00-000Z', + test_id: 'missing-optionals', + target: 'codex', + }, + trace: { + format: 
'otlp_openinference_spans', + trace_id: 'trace-missing', + root_span_id: 'root-missing', + spans: [ + { + trace_id: 'trace-missing', + span_id: 'root-missing', + parent_span_id: null, + name: 'invoke_agent codex', + kind: 'INTERNAL', + status: { code: 'OK' }, + attributes: { + 'agentv.test_id': 'missing-optionals', + }, + events: [], + }, + ], + }, + source: { + kind: 'agentv_run', + path: 'index.jsonl', + provider: 'codex', + metadata: { + external_trace: { + provider: 'codex', + session_id: 'codex-session-789', + url: 'not-a-url', + token: 'secret', + }, + }, + }, +}; diff --git a/apps/dashboard/src/lib/trace-read-model.test.ts b/apps/dashboard/src/lib/trace-read-model.test.ts new file mode 100644 index 000000000..474573813 --- /dev/null +++ b/apps/dashboard/src/lib/trace-read-model.test.ts @@ -0,0 +1,139 @@ +import { describe, expect, it } from 'bun:test'; + +import { + traceSessionEnvelopeFixture, + traceSessionMissingOptionalFixture, +} from './__fixtures__/trace-session-read-model'; +import { buildTraceSpanTree, traceEnvelopeToTraceSessionResponse } from './trace-read-model'; + +function expectSnakeCaseFixtureKeys(value: unknown, path: string[] = []): void { + if (Array.isArray(value)) { + value.forEach((entry, index) => expectSnakeCaseFixtureKeys(entry, [...path, String(index)])); + return; + } + if (!value || typeof value !== 'object') { + return; + } + + for (const [key, entry] of Object.entries(value)) { + const parentKey = path.at(-1); + if (parentKey !== 'attributes') { + expect(key, [...path, key].join('.')).toMatch(/^[a-z][a-z0-9_]*$/); + } + expectSnakeCaseFixtureKeys(entry, [...path, key]); + } +} + +describe('trace session read model', () => { + it('projects snake_case trace artifacts into stable Dashboard span trees', () => { + const session = traceEnvelopeToTraceSessionResponse(traceSessionEnvelopeFixture, { + artifactPath: 'nested-session__codex/outputs/trace.json', + }); + const tree = buildTraceSpanTree(session.spans); + + 
expect(session).toMatchObject({ + schema_version: 'agentv.dashboard.trace_session.v1', + run_id: '2026-06-21T10-00-00-000Z', + test_id: 'nested-session', + target: 'codex', + trace_id: 'trace-123', + root_span_id: 'root-span', + source: { + artifact_path: 'nested-session__codex/outputs/trace.json', + }, + }); + expect(session.spans.map((span) => span.id)).toEqual([ + 'root-span', + 'child-chat', + 'grandchild-tool', + ]); + expect(session.spans.map((span) => span.parent_span_id)).toEqual([ + null, + 'root-span', + 'child-chat', + ]); + expect(tree).toHaveLength(1); + expect(tree[0].spanId).toBe('root-span'); + expect(tree[0].children[0].spanId).toBe('child-chat'); + expect(tree[0].children[0].children[0].spanId).toBe('grandchild-tool'); + }); + + it('preserves score events, annotation events, scores, and unknown attributes', () => { + const session = traceEnvelopeToTraceSessionResponse(traceSessionEnvelopeFixture); + const root = session.spans.find((span) => span.span_id === 'root-span'); + + expect(root?.duration_ms).toBe(1500); + expect(root?.token_usage).toEqual({ input: 14, output: 9 }); + expect(root?.attributes?.['custom.unknown_value']).toEqual({ nested_value: true }); + + expect(session.events.map((event) => [event.event_id, event.kind, event.name])).toEqual([ + ['annotation-1', 'annotation', 'agentv.annotation'], + ['score-1', 'score', 'agentv.score'], + ]); + expect(session.events[0]).toMatchObject({ + text: 'Reviewer note', + passed: true, + attributes: { extra_context: { source: 'grader' } }, + }); + expect(session.events[1]).toMatchObject({ + score: 0.82, + text: 'Rubric score', + passed: true, + }); + expect(session.scores).toEqual([ + { + name: 'rubric', + type: 'llm-grader', + score: 0.82, + weight: 1, + verdict: 'pass', + source: 'llm', + evaluated_at: '2026-06-21T10:00:02.300Z', + target_span_id: 'root-span', + evidence: { + assertions: [{ text: 'Rubric score', passed: true }], + }, + }, + ]); + }); + + it('keeps external_trace links safe and 
leaves AgentV as canonical source', () => { + const session = traceEnvelopeToTraceSessionResponse(traceSessionEnvelopeFixture); + + expect(session.external_trace).toEqual({ + provider: 'phoenix', + project: 'agentv-dogfood', + session_id: 'codex-session-123', + trace_id: 'phoenix-trace-456', + url: 'https://phoenix.example/projects/agentv-dogfood/traces/phoenix-trace-456', + }); + expect(JSON.stringify(session.external_trace)).not.toContain('secret'); + expect(JSON.stringify(session.external_trace)).not.toContain('api_key'); + expect(session.source?.metadata).toEqual({ + safe_note: 'local artifact remains canonical', + }); + }); + + it('does not invent zero timing, token usage, or broken external links for missing fields', () => { + const session = traceEnvelopeToTraceSessionResponse(traceSessionMissingOptionalFixture); + const root = session.spans[0]; + + expect(root.start_time_unix_nano).toBeUndefined(); + expect(root.end_time_unix_nano).toBeUndefined(); + expect(root.start_time).toBeUndefined(); + expect(root.end_time).toBeUndefined(); + expect(root.duration_ms).toBeUndefined(); + expect(root.token_usage).toBeUndefined(); + expect(session.external_trace).toEqual({ + provider: 'codex', + session_id: 'codex-session-789', + }); + expect(JSON.stringify(session.external_trace)).not.toContain('secret'); + expect(JSON.stringify(session.external_trace)).not.toContain('not-a-url'); + }); + + it('keeps new API fixtures snake_case-only outside opaque attributes maps', () => { + expectSnakeCaseFixtureKeys(traceSessionEnvelopeFixture); + expectSnakeCaseFixtureKeys(traceSessionMissingOptionalFixture); + }); +}); diff --git a/apps/dashboard/src/lib/trace-read-model.ts b/apps/dashboard/src/lib/trace-read-model.ts new file mode 100644 index 000000000..cc1b16d70 --- /dev/null +++ b/apps/dashboard/src/lib/trace-read-model.ts @@ -0,0 +1,490 @@ +import type { + ExternalTraceMetadata, + TraceSessionEvent, + TraceSessionEventKind, + TraceSessionResponse, + TraceSessionScore, + 
TraceSessionSource, + TraceSessionSpan, + TraceSessionTokenUsage, +} from './types'; + +export const TRACE_SESSION_SCHEMA_VERSION = 'agentv.dashboard.trace_session.v1' as const; + +export interface TraceSessionProjectionOptions { + runId?: string; + artifactPath?: string; +} + +export interface TraceSpanNode { + id: string; + spanId: string; + parentSpanId?: string | null; + span: TraceSessionSpan; + children: TraceSpanNode[]; +} + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function asRecord(value: unknown): Record | undefined { + return isRecord(value) ? value : undefined; +} + +function asArray(value: unknown): unknown[] { + return Array.isArray(value) ? value : []; +} + +function stringValue(value: unknown): string | undefined { + return typeof value === 'string' && value.length > 0 ? value : undefined; +} + +function finiteNumber(value: unknown): number | undefined { + return typeof value === 'number' && Number.isFinite(value) ? value : undefined; +} + +function boolValue(value: unknown): boolean | undefined { + return typeof value === 'boolean' ? value : undefined; +} + +function dropUndefined>(value: T): T { + return Object.fromEntries(Object.entries(value).filter(([, entry]) => entry !== undefined)) as T; +} + +function compactRecord(value: Record): Record | undefined { + const compacted = dropUndefined(value); + return Object.keys(compacted).length > 0 ? 
compacted : undefined; +} + +function unixNanoToIso(value: string | undefined): string | undefined { + if (!value) { + return undefined; + } + try { + return new Date(Number(BigInt(value) / 1_000_000n)).toISOString(); + } catch { + return undefined; + } +} + +function durationMsFromNanos( + start: string | undefined, + end: string | undefined, +): number | undefined { + if (!start || !end) { + return undefined; + } + try { + const startNanos = BigInt(start); + const endNanos = BigInt(end); + if (endNanos < startNanos) { + return undefined; + } + return Number(endNanos - startNanos) / 1_000_000; + } catch { + return undefined; + } +} + +function numberFromAttributes( + attributes: Record, + keys: readonly string[], +): number | undefined { + for (const key of keys) { + const value = finiteNumber(attributes[key]); + if (value !== undefined) { + return value; + } + } + return undefined; +} + +function tokenUsageFromAttributes( + attributes: Record | undefined, +): TraceSessionTokenUsage | undefined { + if (!attributes) { + return undefined; + } + + const nested = asRecord(attributes.token_usage); + const usage = compactRecord({ + input: + finiteNumber(nested?.input) ?? + numberFromAttributes(attributes, [ + 'gen_ai.usage.input_tokens', + 'llm.token_count.prompt', + 'input_tokens', + ]), + output: + finiteNumber(nested?.output) ?? + numberFromAttributes(attributes, [ + 'gen_ai.usage.output_tokens', + 'llm.token_count.completion', + 'output_tokens', + ]), + reasoning: + finiteNumber(nested?.reasoning) ?? + numberFromAttributes(attributes, [ + 'gen_ai.usage.reasoning.output_tokens', + 'reasoning_tokens', + ]), + cached: + finiteNumber(nested?.cached) ?? + numberFromAttributes(attributes, ['gen_ai.usage.cache_read.input_tokens', 'cached_tokens']), + total: finiteNumber(nested?.total) ?? 
numberFromAttributes(attributes, ['total_tokens']), + }); + + return usage as TraceSessionTokenUsage | undefined; +} + +function spanStatusFromValue(value: unknown): TraceSessionSpan['status'] { + const record = asRecord(value); + if (!record) { + return undefined; + } + return compactRecord({ + code: + stringValue(record.code) ?? + (typeof record.code === 'number' ? String(record.code) : undefined), + message: stringValue(record.message), + }) as TraceSessionSpan['status']; +} + +function eventKind( + name: string, + attributes: Record | undefined, +): TraceSessionEventKind { + const lowerName = name.toLowerCase(); + if ( + lowerName.includes('score') || + finiteNumber(attributes?.score) !== undefined || + finiteNumber(attributes?.['agentv.score']) !== undefined || + finiteNumber(attributes?.['agentv.grader.score']) !== undefined + ) { + return 'score'; + } + if ( + lowerName.includes('annotation') || + stringValue(attributes?.text) !== undefined || + stringValue(attributes?.annotation) !== undefined || + stringValue(attributes?.['agentv.annotation.text']) !== undefined + ) { + return 'annotation'; + } + if (lowerName === 'exception') { + return 'exception'; + } + return 'event'; +} + +function scoreFromEvent(attributes: Record | undefined): number | undefined { + if (!attributes) { + return undefined; + } + return ( + finiteNumber(attributes.score) ?? + finiteNumber(attributes['agentv.score']) ?? + finiteNumber(attributes['agentv.grader.score']) + ); +} + +function textFromEvent(attributes: Record | undefined): string | undefined { + if (!attributes) { + return undefined; + } + return ( + stringValue(attributes.text) ?? + stringValue(attributes.annotation) ?? + stringValue(attributes['agentv.annotation.text']) ?? + stringValue(attributes['exception.message']) + ); +} + +function passedFromEvent(attributes: Record | undefined): boolean | undefined { + if (!attributes) { + return undefined; + } + return boolValue(attributes.passed) ?? 
boolValue(attributes['agentv.annotation.passed']); +} + +function eventId( + spanId: string, + index: number, + attributes: Record | undefined, +): string { + return ( + stringValue(attributes?.event_id) ?? + stringValue(attributes?.['agentv.event_id']) ?? + `${spanId}:event:${index}` + ); +} + +function projectSpanEvent( + spanId: string, + event: unknown, + index: number, +): TraceSessionEvent | undefined { + const record = asRecord(event); + if (!record) { + return undefined; + } + const name = stringValue(record.name); + if (!name) { + return undefined; + } + + const attributes = asRecord(record.attributes); + return dropUndefined({ + event_id: eventId(spanId, index, attributes), + span_id: spanId, + name, + kind: eventKind(name, attributes), + time_unix_nano: stringValue(record.time_unix_nano), + timestamp: unixNanoToIso(stringValue(record.time_unix_nano)), + score: scoreFromEvent(attributes), + text: textFromEvent(attributes), + passed: passedFromEvent(attributes), + attributes, + }); +} + +function projectSpan(span: unknown, index: number): TraceSessionSpan | undefined { + const record = asRecord(span); + if (!record) { + return undefined; + } + + const spanId = stringValue(record.span_id) ?? `span-${index}`; + const traceId = stringValue(record.trace_id); + const parentSpanId = record.parent_span_id === null ? null : stringValue(record.parent_span_id); + const attributes = asRecord(record.attributes); + const startTimeUnixNano = stringValue(record.start_time_unix_nano); + const endTimeUnixNano = stringValue(record.end_time_unix_nano); + const events = asArray(record.events) + .map((event, eventIndex) => projectSpanEvent(spanId, event, eventIndex)) + .filter((event): event is TraceSessionEvent => event !== undefined); + + return dropUndefined({ + id: spanId, + trace_id: traceId, + span_id: spanId, + parent_span_id: parentSpanId, + name: stringValue(record.name) ?? 
spanId, + kind: stringValue(record.kind), + status: spanStatusFromValue(record.status), + start_time_unix_nano: startTimeUnixNano, + end_time_unix_nano: endTimeUnixNano, + start_time: unixNanoToIso(startTimeUnixNano), + end_time: unixNanoToIso(endTimeUnixNano), + duration_ms: durationMsFromNanos(startTimeUnixNano, endTimeUnixNano), + token_usage: tokenUsageFromAttributes(attributes), + attributes, + events: events.length > 0 ? events : undefined, + }); +} + +function projectScores(scores: unknown): TraceSessionScore[] | undefined { + const projected: TraceSessionScore[] = []; + + for (const score of asArray(scores)) { + const record = asRecord(score); + const name = stringValue(record?.name); + const value = finiteNumber(record?.score); + if (!record || !name || value === undefined) { + continue; + } + projected.push( + dropUndefined({ + name, + type: stringValue(record.type), + score: value, + weight: finiteNumber(record.weight), + verdict: stringValue(record.verdict), + source: stringValue(record.source), + evaluated_at: stringValue(record.evaluated_at), + target_span_id: stringValue(record.target_span_id), + evidence: asRecord(record.evidence), + }) as TraceSessionScore, + ); + } + + return projected.length > 0 ? 
projected : undefined; +} + +const EXTERNAL_TRACE_KEYS = ['provider', 'project', 'session_id', 'trace_id', 'url'] as const; + +function isSecretLikeKey(key: string): boolean { + return /(api[_-]?key|authorization|bearer|password|secret|token)/i.test(key); +} + +function sanitizeUrl(value: unknown): string | undefined { + const raw = stringValue(value); + if (!raw) { + return undefined; + } + try { + const url = new URL(raw); + if (!['http:', 'https:'].includes(url.protocol) || url.username || url.password) { + return undefined; + } + url.search = ''; + url.hash = ''; + return url.toString(); + } catch { + return undefined; + } +} + +function sanitizeExternalTrace(value: unknown): ExternalTraceMetadata | undefined { + const record = asRecord(value); + if (!record) { + return undefined; + } + + const sanitized = compactRecord({ + provider: stringValue(record.provider), + project: stringValue(record.project), + session_id: stringValue(record.session_id), + trace_id: stringValue(record.trace_id), + url: sanitizeUrl(record.url), + }) as ExternalTraceMetadata | undefined; + + return sanitized && EXTERNAL_TRACE_KEYS.some((key) => sanitized[key] !== undefined) + ? sanitized + : undefined; +} + +function externalTraceFromFlatMetadata( + metadata: Record | undefined, +): ExternalTraceMetadata | undefined { + if (!metadata) { + return undefined; + } + return sanitizeExternalTrace({ + provider: metadata.external_trace_provider ?? metadata['external_trace.provider'], + project: metadata.external_trace_project ?? metadata['external_trace.project'], + session_id: metadata.external_trace_session_id ?? metadata['external_trace.session_id'], + trace_id: metadata.external_trace_trace_id ?? metadata['external_trace.trace_id'], + url: metadata.external_trace_url ?? 
metadata['external_trace.url'], + }); +} + +function sanitizeMetadata( + value: Record | undefined, +): Record | undefined { + if (!value) { + return undefined; + } + const entries = Object.entries(value).flatMap(([key, entry]) => { + if ( + key === 'external_trace' || + key.startsWith('external_trace_') || + key.startsWith('external_trace.') || + isSecretLikeKey(key) + ) { + return []; + } + if (isRecord(entry)) { + const nested = sanitizeMetadata(entry); + return nested ? [[key, nested] as const] : []; + } + return [[key, entry] as const]; + }); + return entries.length > 0 ? Object.fromEntries(entries) : undefined; +} + +function sourceFromEnvelope( + source: Record | undefined, + artifactPath: string | undefined, +): TraceSessionSource | undefined { + if (!source && !artifactPath) { + return undefined; + } + return compactRecord({ + kind: stringValue(source?.kind), + path: stringValue(source?.path), + provider: stringValue(source?.provider), + format: stringValue(source?.format), + version: stringValue(source?.version), + artifact_path: artifactPath, + metadata: sanitizeMetadata(asRecord(source?.metadata)), + }) as TraceSessionSource | undefined; +} + +function externalTraceFromEnvelope( + envelope: Record, +): ExternalTraceMetadata | undefined { + const source = asRecord(envelope.source); + const sourceMetadata = asRecord(source?.metadata); + const trace = asRecord(envelope.trace); + const rootSpanId = stringValue(trace?.root_span_id); + const rootSpan = asArray(trace?.spans) + .map(asRecord) + .find((span) => stringValue(span?.span_id) === rootSpanId); + const rootAttributes = asRecord(rootSpan?.attributes); + + return ( + sanitizeExternalTrace(envelope.external_trace) ?? + sanitizeExternalTrace(sourceMetadata?.external_trace) ?? + externalTraceFromFlatMetadata(sourceMetadata) ?? 
+ externalTraceFromFlatMetadata(rootAttributes) + ); +} + +export function traceEnvelopeToTraceSessionResponse( + input: unknown, + options: TraceSessionProjectionOptions = {}, +): TraceSessionResponse { + const envelope = asRecord(input) ?? {}; + const evaluation = asRecord(envelope.eval); + const trace = asRecord(envelope.trace); + const spans = asArray(trace?.spans) + .map(projectSpan) + .filter((span): span is TraceSessionSpan => span !== undefined); + const events = spans.flatMap((span) => span.events ?? []); + + return dropUndefined({ + schema_version: TRACE_SESSION_SCHEMA_VERSION, + artifact_id: stringValue(envelope.artifact_id), + created_at: stringValue(envelope.created_at), + run_id: options.runId ?? stringValue(evaluation?.run_id), + test_id: stringValue(evaluation?.test_id), + suite: stringValue(evaluation?.suite), + target: stringValue(evaluation?.target), + trace_id: stringValue(trace?.trace_id), + root_span_id: stringValue(trace?.root_span_id), + source: sourceFromEnvelope(asRecord(envelope.source), options.artifactPath), + external_trace: externalTraceFromEnvelope(envelope), + spans, + events, + scores: projectScores(envelope.scores), + }); +} + +export function buildTraceSpanTree(spans: readonly TraceSessionSpan[]): TraceSpanNode[] { + const nodes = new Map(); + const roots: TraceSpanNode[] = []; + + for (const span of spans) { + nodes.set(span.span_id, { + id: span.id, + spanId: span.span_id, + parentSpanId: span.parent_span_id, + span, + children: [], + }); + } + + for (const node of nodes.values()) { + const parentId = typeof node.parentSpanId === 'string' ? node.parentSpanId : undefined; + const parent = parentId ? 
nodes.get(parentId) : undefined; + if (parent) { + parent.children.push(node); + } else { + roots.push(node); + } + } + + return roots; +} diff --git a/apps/dashboard/src/lib/types.ts b/apps/dashboard/src/lib/types.ts index 087836a37..0a6fd8453 100644 --- a/apps/dashboard/src/lib/types.ts +++ b/apps/dashboard/src/lib/types.ts @@ -120,6 +120,103 @@ export interface SourceTraceability { referenced_files?: SourceReferencedFile[]; } +export interface ExternalTraceMetadata { + /** + * Optional external viewer reference only. AgentV run artifacts remain the + * canonical source of truth for Dashboard trace/session details. + */ + provider?: string; + project?: string; + session_id?: string; + trace_id?: string; + url?: string; +} + +export interface TraceSessionTokenUsage { + input?: number; + output?: number; + reasoning?: number; + cached?: number; + total?: number; +} + +export interface TraceSessionSpanStatus { + code?: string; + message?: string; +} + +export type TraceSessionEventKind = 'annotation' | 'exception' | 'event' | 'score'; + +export interface TraceSessionEvent { + event_id: string; + span_id: string; + name: string; + kind: TraceSessionEventKind; + time_unix_nano?: string; + timestamp?: string; + score?: number; + text?: string; + passed?: boolean; + attributes?: Record; +} + +export interface TraceSessionSpan { + id: string; + trace_id?: string; + span_id: string; + parent_span_id?: string | null; + name: string; + kind?: string; + status?: TraceSessionSpanStatus; + start_time_unix_nano?: string; + end_time_unix_nano?: string; + start_time?: string; + end_time?: string; + duration_ms?: number; + token_usage?: TraceSessionTokenUsage; + attributes?: Record; + events?: TraceSessionEvent[]; +} + +export interface TraceSessionScore { + name: string; + type?: string; + score: number; + weight?: number; + verdict?: string; + source?: string; + evaluated_at?: string; + target_span_id?: string; + evidence?: Record; +} + +export interface TraceSessionSource { + 
kind?: string; + path?: string; + provider?: string; + format?: string; + version?: string; + artifact_path?: string; + metadata?: Record; +} + +export interface TraceSessionResponse { + schema_version: 'agentv.dashboard.trace_session.v1'; + artifact_id?: string; + created_at?: string; + run_id?: string; + test_id?: string; + suite?: string; + target?: string; + trace_id?: string; + root_span_id?: string; + source?: TraceSessionSource; + external_trace?: ExternalTraceMetadata; + spans: TraceSessionSpan[]; + events: TraceSessionEvent[]; + scores?: TraceSessionScore[]; +} + export interface EvalResult { testId: string; timestamp?: string; From b20f17f3a942a972b106492bb8ae1d35dd8b271d Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 05:27:29 +0200 Subject: [PATCH 06/21] feat(results): add projection bundle export --- apps/cli/src/commands/results/export.ts | 54 ++- .../src/commands/results/projection-bundle.ts | 369 ++++++++++++++++++ apps/cli/test/commands/results/export.test.ts | 126 ++++++ .../src/content/docs/docs/tools/results.mdx | 39 ++ 4 files changed, 586 insertions(+), 2 deletions(-) create mode 100644 apps/cli/src/commands/results/projection-bundle.ts diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts index 76139169e..21120c27e 100644 --- a/apps/cli/src/commands/results/export.ts +++ b/apps/cli/src/commands/results/export.ts @@ -23,12 +23,17 @@ import path from 'node:path'; -import { command, oneOf, option, optional, positional, string } from 'cmd-ts'; +import { command, flag, oneOf, option, optional, positional, string } from 'cmd-ts'; import type { EvaluationResult, ExportDuplicatePolicy } from '@agentv/core'; import { parseJsonlResults, writeArtifactsFromResults } from '../eval/artifact-writer.js'; import { RESULT_INDEX_FILENAME } from '../eval/result-layout.js'; +import { + buildProjectionBundle, + serializeProjectionBundle, + writeProjectionBundle, +} from './projection-bundle.js'; import 
{ loadResults as loadSharedResults, resolveSourceFile } from './shared.js'; // ── Export logic ───────────────────────────────────────────────────────── @@ -122,10 +127,34 @@ export const resultsExportCommand = command({ description: 'How to handle duplicate projection identities in the output: update (default), skip, or error', }), + projectionBundle: flag({ + long: 'projection-bundle', + description: 'Write a vendor-neutral projection_bundle.json alongside exported artifacts', + }), + dryRun: flag({ + long: 'dry-run', + description: 'Print deterministic projection bundle JSON without writing export artifacts', + }), + includeRawContent: flag({ + long: 'include-raw-content', + description: + 'Include raw prompt, output, and tool payload content in the projection bundle (off by default)', + }), }, - handler: async ({ source, out, dir, duplicatePolicy }) => { + handler: async ({ + source, + out, + dir, + duplicatePolicy, + projectionBundle, + dryRun, + includeRawContent, + }) => { const cwd = dir ?? process.cwd(); const policy = (duplicatePolicy ?? 'update') as ExportDuplicatePolicy; + const shouldWriteProjectionBundle = projectionBundle; + const shouldDryRun = dryRun; + const shouldIncludeRawContent = includeRawContent; try { const { sourceFile, results } = await loadExportSource(source, cwd); @@ -136,14 +165,35 @@ export const resultsExportCommand = command({ : path.resolve(cwd, out) : deriveOutputDir(cwd, sourceFile); + const buildBundle = () => + buildProjectionBundle(results, { + sourceFile, + runId: deriveExportRunId(sourceFile), + cwd, + duplicatePolicy: policy, + includeRawContent: shouldIncludeRawContent, + }); + + if (shouldDryRun) { + process.stdout.write(serializeProjectionBundle(buildBundle())); + return; + } + await writeArtifactsFromResults(results, outputDir, { evalFile: sourceFile, runId: deriveExportRunId(sourceFile), duplicatePolicy: policy, }); + const bundlePath = shouldWriteProjectionBundle + ? 
await writeProjectionBundle(buildBundle(), outputDir) + : undefined; + // Report exported test IDs console.log(`Exported ${results.length} test(s) to ${outputDir}`); + if (bundlePath) { + console.log(`Projection bundle written to ${bundlePath}`); + } for (const result of results) { console.log(` ${result.testId ?? 'unknown'}`); } diff --git a/apps/cli/src/commands/results/projection-bundle.ts b/apps/cli/src/commands/results/projection-bundle.ts new file mode 100644 index 000000000..b955af27c --- /dev/null +++ b/apps/cli/src/commands/results/projection-bundle.ts @@ -0,0 +1,369 @@ +/** + * Vendor-neutral projection bundle for completed AgentV runs. + * + * This file builds a deterministic, local JSON contract that adapter workers + * can consume without calling vendor SDKs. The bundle keeps AgentV artifacts as + * the source of truth, includes metadata-only OpenInference-shaped spans by + * default, and requires explicit opt-in before raw prompt/output/tool payloads + * are copied into the bundle. 
+ */ + +import { createHash } from 'node:crypto'; +import { mkdir, writeFile } from 'node:fs/promises'; +import path from 'node:path'; + +import { + type EvaluationResult, + type ExportDuplicatePolicy, + type IndexArtifactEntry, + type ProjectionIdentityWire, + type TraceEnvelopeCaptureWire, + type TraceEnvelopeConversionWarningWire, + type TraceEnvelopeScoreWire, + type TraceEnvelopeWire, + buildResultIndexArtifact, + buildTraceEnvelopeFromEvaluationResult, + toTraceEnvelopeWire, +} from '@agentv/core'; + +export const PROJECTION_BUNDLE_FILENAME = 'projection_bundle.json'; +export const PROJECTION_BUNDLE_SCHEMA_VERSION = 'agentv.projection_bundle.v1'; + +type JsonRecord = Record; + +export interface ProjectionBundle { + readonly schema_version: typeof PROJECTION_BUNDLE_SCHEMA_VERSION; + readonly bundle_id: string; + readonly created_at: string; + readonly source: { + readonly kind: 'agentv_run'; + readonly path: string; + readonly run_id: string; + readonly result_count: number; + }; + readonly content_policy: { + readonly raw_content: 'excluded' | 'included'; + readonly raw_content_opt_in: boolean; + readonly default_capture: 'metadata' | 'full'; + readonly backend_anonymizer_boundary: 'adapter'; + }; + readonly capture_summary: TraceEnvelopeCaptureWire; + readonly entries: readonly ProjectionBundleEntry[]; + readonly conversion_warnings?: readonly TraceEnvelopeConversionWarningWire[]; +} + +export interface ProjectionBundleEntry { + readonly projection_id: string; + readonly projection_identity: ProjectionIdentityWire; + readonly eval: TraceEnvelopeWire['eval']; + readonly artifact_refs: ProjectionBundleArtifactRefs; + readonly trace: { + readonly format: TraceEnvelopeWire['trace']['format']; + readonly trace_id: string; + readonly root_span_id: string; + readonly span_count: number; + readonly envelope_ref?: string; + }; + readonly trace_envelope: TraceEnvelopeWire; + readonly feedback: { + readonly source: 'agentv_grading_artifacts'; + readonly result_score: 
number; + readonly execution_status?: string; + readonly grading_path?: string; + readonly timing_path?: string; + readonly assertion_count: number; + readonly scores?: readonly TraceEnvelopeScoreWire[]; + }; + readonly capture: TraceEnvelopeCaptureWire; + readonly conversion_warnings?: readonly TraceEnvelopeConversionWarningWire[]; + readonly raw_content?: { + readonly input?: unknown; + readonly output?: string; + readonly trace_messages?: unknown; + }; +} + +export type ProjectionBundleArtifactRefs = Partial< + Pick< + IndexArtifactEntry, + | 'artifact_dir' + | 'grading_path' + | 'timing_path' + | 'input_path' + | 'output_path' + | 'answer_path' + | 'response_path' + | 'transcript_path' + | 'task_dir' + | 'eval_path' + | 'targets_path' + | 'files_path' + | 'graders_path' + > & { readonly trace_path: string } +>; + +export interface BuildProjectionBundleOptions { + readonly sourceFile: string; + readonly runId: string; + readonly cwd?: string; + readonly includeRawContent?: boolean; + readonly duplicatePolicy?: ExportDuplicatePolicy; +} + +function dropUndefined(value: T): T { + return Object.fromEntries( + Object.entries(value).filter(([, entryValue]) => entryValue !== undefined), + ) as T; +} + +function toPortablePath(filePath: string, cwd?: string): string { + const absolutePath = path.resolve(filePath); + const absoluteCwd = path.resolve(cwd ?? process.cwd()); + const relative = path.relative(absoluteCwd, absolutePath); + const portable = + relative && !relative.startsWith('..') && !path.isAbsolute(relative) ? relative : absolutePath; + return portable.split(path.sep).join('/'); +} + +function stableDate(value: string | undefined): Date { + const parsed = value ? Date.parse(value) : Number.NaN; + return Number.isFinite(parsed) ? 
new Date(parsed) : new Date(0); +} + +function bundleCreatedAt(results: readonly EvaluationResult[]): string { + const timestamps = results + .map((result) => stableDate(result.timestamp).toISOString()) + .sort((a, b) => a.localeCompare(b)); + return timestamps[0] ?? new Date(0).toISOString(); +} + +function shortHash(parts: readonly string[], length = 20): string { + return createHash('sha256').update(parts.join('\n')).digest('hex').slice(0, length); +} + +function tracePathFor(indexEntry: IndexArtifactEntry): string | undefined { + return indexEntry.artifact_dir + ? path.posix.join(indexEntry.artifact_dir, 'outputs', 'trace.json') + : undefined; +} + +function artifactRefs(indexEntry: IndexArtifactEntry): ProjectionBundleArtifactRefs { + return dropUndefined({ + artifact_dir: indexEntry.artifact_dir, + grading_path: indexEntry.grading_path, + timing_path: indexEntry.timing_path, + input_path: indexEntry.input_path, + output_path: indexEntry.output_path, + answer_path: indexEntry.answer_path, + response_path: indexEntry.response_path, + transcript_path: indexEntry.transcript_path, + trace_path: tracePathFor(indexEntry), + task_dir: indexEntry.task_dir, + eval_path: indexEntry.eval_path, + targets_path: indexEntry.targets_path, + files_path: indexEntry.files_path, + graders_path: indexEntry.graders_path, + }); +} + +function removeTranscriptMessageMetadata(envelope: TraceEnvelopeWire): TraceEnvelopeWire { + return { + ...envelope, + trace: { + ...envelope.trace, + spans: envelope.trace.spans.map((span) => ({ + ...span, + events: span.events?.map((event) => { + const transcriptMessage = event.attributes?.['agentv.transcript.message']; + if ( + !transcriptMessage || + typeof transcriptMessage !== 'object' || + Array.isArray(transcriptMessage) + ) { + return event; + } + const { metadata: _metadata, ...safeMessage } = transcriptMessage as JsonRecord; + return { + ...event, + attributes: { + ...event.attributes, + 'agentv.transcript.message': safeMessage, + }, + }; + 
}), + })), + }, + }; +} + +function safeEnvelope( + envelope: TraceEnvelopeWire, + options: { includeRawContent: boolean }, +): TraceEnvelopeWire { + if (options.includeRawContent) { + return envelope; + } + + const withoutRawEvidence = removeTranscriptMessageMetadata({ + ...envelope, + source: { + ...envelope.source, + metadata: undefined, + }, + scores: envelope.scores?.map(({ evidence: _evidence, ...score }) => score), + }); + + return JSON.parse(JSON.stringify(withoutRawEvidence)) as TraceEnvelopeWire; +} + +function safeScores( + scores: readonly TraceEnvelopeScoreWire[] | undefined, + options: { includeRawContent: boolean }, +): readonly TraceEnvelopeScoreWire[] | undefined { + if (!scores) { + return undefined; + } + return options.includeRawContent + ? scores + : scores.map(({ evidence: _evidence, ...score }) => score); +} + +function captureOptions(includeRawContent: boolean) { + return includeRawContent + ? { content: 'full' as const, redactionLevel: 'none' as const, redactedFields: [] } + : undefined; +} + +function rawContent(result: EvaluationResult): ProjectionBundleEntry['raw_content'] { + return dropUndefined({ + input: result.input, + output: result.output, + trace_messages: result.trace.messages, + }); +} + +function buildEntry( + result: EvaluationResult, + options: BuildProjectionBundleOptions, +): ProjectionBundleEntry { + const includeRawContent = options.includeRawContent ?? false; + const sourcePath = toPortablePath(options.sourceFile, options.cwd); + const envelope = buildTraceEnvelopeFromEvaluationResult(result, { + evalPath: sourcePath, + runId: options.runId, + source: { kind: 'agentv_run', path: sourcePath, format: 'agentv_result' }, + artifacts: { + trace_path: tracePathFor(buildResultIndexArtifact(result)), + answer_path: result.output.length > 0 ? 'outputs/answer.md' : undefined, + response_path: result.output.length > 0 ? 
'outputs/response.md' : undefined, + }, + duplicatePolicy: options.duplicatePolicy, + capture: captureOptions(includeRawContent), + now: () => stableDate(result.timestamp), + }); + const projectionIdentity = envelope.projectionIdentity; + if (!projectionIdentity) { + throw new Error(`Result ${result.testId ?? 'unknown'} is missing projection identity`); + } + + const indexEntry = buildResultIndexArtifact(result, undefined, { + projectionIdentity, + duplicatePolicy: options.duplicatePolicy, + }); + const refs = artifactRefs(indexEntry); + const envelopeWire = safeEnvelope(toTraceEnvelopeWire(envelope), { includeRawContent }); + const projectionIdentityWire = envelopeWire.projection_identity; + if (!projectionIdentityWire) { + throw new Error(`Result ${result.testId ?? 'unknown'} is missing projection identity`); + } + const scores = safeScores(envelopeWire.scores, { includeRawContent }); + + const feedback: ProjectionBundleEntry['feedback'] = dropUndefined({ + source: 'agentv_grading_artifacts', + result_score: result.score, + execution_status: result.executionStatus, + grading_path: refs.grading_path, + timing_path: refs.timing_path, + assertion_count: result.assertions?.length ?? 0, + scores, + }); + + return { + projection_id: projectionIdentity.id, + projection_identity: projectionIdentityWire, + eval: envelopeWire.eval, + artifact_refs: refs, + trace: { + format: envelopeWire.trace.format, + trace_id: envelopeWire.trace.trace_id, + root_span_id: envelopeWire.trace.root_span_id, + span_count: envelopeWire.trace.spans.length, + envelope_ref: refs.trace_path, + }, + trace_envelope: envelopeWire, + feedback, + capture: envelopeWire.capture, + ...(envelopeWire.conversion_warnings + ? { conversion_warnings: envelopeWire.conversion_warnings } + : {}), + ...(includeRawContent ? 
{ raw_content: rawContent(result) } : {}), + }; +} + +export function buildProjectionBundle( + results: readonly EvaluationResult[], + options: BuildProjectionBundleOptions, +): ProjectionBundle { + if (results.length === 0) { + throw new Error(`No results found in ${options.sourceFile}`); + } + + const entries = results.map((result) => buildEntry(result, options)); + const includeRawContent = options.includeRawContent ?? false; + const conversionWarnings = entries.flatMap((entry) => entry.conversion_warnings ?? []); + const bundleId = `projection-bundle-${shortHash([ + PROJECTION_BUNDLE_SCHEMA_VERSION, + toPortablePath(options.sourceFile, options.cwd), + options.runId, + includeRawContent ? 'raw' : 'metadata', + ...entries.map((entry) => entry.projection_id), + ])}`; + + return { + schema_version: PROJECTION_BUNDLE_SCHEMA_VERSION, + bundle_id: bundleId, + created_at: bundleCreatedAt(results), + source: { + kind: 'agentv_run', + path: toPortablePath(options.sourceFile, options.cwd), + run_id: options.runId, + result_count: results.length, + }, + content_policy: { + raw_content: includeRawContent ? 'included' : 'excluded', + raw_content_opt_in: includeRawContent, + default_capture: includeRawContent ? 'full' : 'metadata', + backend_anonymizer_boundary: 'adapter', + }, + capture_summary: entries[0]?.capture ?? { + content: includeRawContent ? 'full' : 'metadata', + redaction_level: includeRawContent ? 'none' : 'partial', + }, + entries, + ...(conversionWarnings.length > 0 ? 
{ conversion_warnings: conversionWarnings } : {}), + }; +} + +export function serializeProjectionBundle(bundle: ProjectionBundle): string { + return `${JSON.stringify(bundle, null, 2)}\n`; +} + +export async function writeProjectionBundle( + bundle: ProjectionBundle, + outputDir: string, +): Promise { + const bundlePath = path.join(outputDir, PROJECTION_BUNDLE_FILENAME); + await mkdir(outputDir, { recursive: true }); + await writeFile(bundlePath, serializeProjectionBundle(bundle), 'utf8'); + return bundlePath; +} diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index c034089f7..ec4cc6940 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -9,12 +9,17 @@ import type { IndexArtifactEntry, TimingArtifact, } from '../../../src/commands/eval/artifact-writer.js'; +import { parseJsonlResults } from '../../../src/commands/eval/artifact-writer.js'; import { deriveExportRunId, deriveOutputDir, exportResults, loadExportSource, } from '../../../src/commands/results/export.js'; +import { + buildProjectionBundle, + serializeProjectionBundle, +} from '../../../src/commands/results/projection-bundle.js'; // ── Sample JSONL content (snake_case, matching on-disk format) ────────── @@ -96,6 +101,63 @@ const RESULT_NO_TRACE = { duration_ms: 500, }; +const RESULT_WITH_RAW_PAYLOADS = { + timestamp: '2026-03-18T10:00:20.000Z', + test_id: 'test-private', + suite: 'privacy', + score: 0.25, + assertions: [ + { + text: 'Avoids private content', + passed: false, + evidence: 'SECRET_ASSERTION_EVIDENCE', + }, + ], + output: 'SECRET_FINAL_OUTPUT', + target: 'codex', + input: [{ role: 'user', content: 'SECRET_PROMPT_TEXT' }], + scores: [ + { + name: 'privacy_review', + type: 'llm-grader', + score: 0.25, + assertions: [ + { + text: 'Avoids private content', + passed: false, + evidence: 'SECRET_SCORE_EVIDENCE', + }, + ], + details: { excerpt: 'SECRET_SCORE_DETAILS' }, + }, + ], + 
execution_status: 'quality_failure', + duration_ms: 900, + trace: { + messages: [ + { role: 'user', content: 'SECRET_PROMPT_TEXT' }, + { + role: 'assistant', + content: 'SECRET_FINAL_OUTPUT', + tool_calls: [ + { + id: 'tool-call-1', + tool: 'shell', + input: { command: 'cat SECRET_TOOL_ARGUMENTS' }, + output: 'SECRET_TOOL_RESULT', + status: 'ok', + }, + ], + }, + ], + events: [], + event_count: 2, + tool_calls: { shell: 1 }, + error_count: 0, + llm_call_count: 1, + }, +}; + function toJsonl(...records: object[]): string { return `${records.map((r) => JSON.stringify(r)).join('\n')}\n`; } @@ -184,6 +246,70 @@ describe('results export', () => { expect(deriveExportRunId(path.join(tempDir, 'legacy-results.jsonl'))).toBe('legacy-results'); }); + it('builds deterministic metadata-only projection bundle output for dry-run use', () => { + const sourceFile = path.join(tempDir, 'runs', 'privacy-run', 'index.jsonl'); + const [result] = parseJsonlResults(toJsonl(RESULT_WITH_RAW_PAYLOADS)); + + const first = buildProjectionBundle([result], { + sourceFile, + runId: 'privacy-run', + cwd: tempDir, + duplicatePolicy: 'update', + }); + const second = buildProjectionBundle([result], { + sourceFile, + runId: 'privacy-run', + cwd: tempDir, + duplicatePolicy: 'update', + }); + const serialized = serializeProjectionBundle(first); + + expect(serialized).toBe(serializeProjectionBundle(second)); + expect(first.content_policy).toMatchObject({ + raw_content: 'excluded', + raw_content_opt_in: false, + default_capture: 'metadata', + }); + expect(first.entries[0].projection_identity.dimensions.run_id).toBe('privacy-run'); + expect(first.entries[0].trace_envelope.trace.spans.length).toBeGreaterThan(0); + expect(first.entries[0].feedback.scores?.[0]).not.toHaveProperty('evidence'); + expect(serialized).not.toContain('SECRET_PROMPT_TEXT'); + expect(serialized).not.toContain('SECRET_FINAL_OUTPUT'); + expect(serialized).not.toContain('SECRET_TOOL_ARGUMENTS'); + 
expect(serialized).not.toContain('SECRET_TOOL_RESULT'); + expect(serialized).not.toContain('SECRET_SCORE_EVIDENCE'); + }); + + it('includes raw prompt, output, tool payloads, and score evidence only with opt-in', () => { + const sourceFile = path.join(tempDir, 'runs', 'privacy-run', 'index.jsonl'); + const [result] = parseJsonlResults(toJsonl(RESULT_WITH_RAW_PAYLOADS)); + + const bundle = buildProjectionBundle([result], { + sourceFile, + runId: 'privacy-run', + cwd: tempDir, + includeRawContent: true, + }); + const serialized = serializeProjectionBundle(bundle); + + expect(bundle.content_policy).toMatchObject({ + raw_content: 'included', + raw_content_opt_in: true, + default_capture: 'full', + }); + expect(bundle.entries[0].capture).toMatchObject({ + content: 'full', + redaction_level: 'none', + }); + expect(bundle.entries[0].raw_content).toBeDefined(); + expect(bundle.entries[0].feedback.scores?.[0]).toHaveProperty('evidence'); + expect(serialized).toContain('SECRET_PROMPT_TEXT'); + expect(serialized).toContain('SECRET_FINAL_OUTPUT'); + expect(serialized).toContain('SECRET_TOOL_ARGUMENTS'); + expect(serialized).toContain('SECRET_TOOL_RESULT'); + expect(serialized).toContain('SECRET_SCORE_EVIDENCE'); + }); + it('should create benchmark.json matching artifact-writer schema', async () => { const outputDir = path.join(tempDir, 'output'); const content = toJsonl(RESULT_FULL, RESULT_PARTIAL); diff --git a/apps/web/src/content/docs/docs/tools/results.mdx b/apps/web/src/content/docs/docs/tools/results.mdx index 4837595aa..8c888e4c7 100644 --- a/apps/web/src/content/docs/docs/tools/results.mdx +++ b/apps/web/src/content/docs/docs/tools/results.mdx @@ -108,6 +108,45 @@ Duplicate policy is explicit: `attempt` defaults to `0`, `variant` defaults to `null`, and `source_target` defaults to `target` when a run has no replay source. 
Replay and rerun sources can set `source_target`, `attempt`, or `variant`; those values are part of the identity, so different attempts, variants, or source targets produce distinct projection IDs. +### Vendor-neutral projection bundle + +Use the additive projection bundle path when an external adapter needs a +backend-neutral handoff instead of AgentV's full artifact tree: + +```bash +agentv results export --projection-bundle +``` + +This writes `projection_bundle.json` next to the exported artifacts. The bundle +contains stable projection IDs, trace envelope metadata, OpenInference-shaped +span references, score provenance, artifact-relative paths, capture/redaction +summary, and conversion warnings. It does not call Phoenix, Opik, Braintrust, +Langfuse, Hugging Face, or any other live service. + +For adapter development and CI snapshots, use dry-run mode: + +```bash +agentv results export --dry-run > projection_bundle.json +``` + +Dry-run prints deterministic JSON and does not write export artifacts. Vendor +adapters should consume either this JSON directly or the local +`projection_bundle.json` plus the referenced files such as `grading.json`, +`timing.json`, and `outputs/trace.json`. + +Raw prompt text, final output, and tool arguments/results are excluded by +default. To include them in the bundle, opt in explicitly: + +```bash +agentv results export --dry-run --include-raw-content +``` + +Keep backend-specific anonymization in the adapter layer. For example, an Opik +adapter can read the metadata-only bundle by default, or require +`--include-raw-content` and then run Opik anonymizers before upload. AgentV does +not run a custom redaction engine in `results export`; it records the capture +policy so downstream processing is auditable. 
+ ## Inspection helpers For lightweight terminal workflows: From 79e22b4bb7c2b9c6b693e4723b3e6d1839623f84 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 05:29:45 +0200 Subject: [PATCH 07/21] feat(artifacts): preserve raw provider logs beside transcripts --- apps/cli/src/commands/eval/artifact-writer.ts | 1 + apps/cli/src/commands/eval/run-eval.ts | 4 +- .../commands/eval/artifact-writer.test.ts | 66 +++++++++++++++++++ .../docs/docs/evaluation/running-evals.mdx | 12 +++- .../src/content/docs/docs/tools/import.mdx | 5 +- packages/core/src/evaluation/orchestrator.ts | 21 ++++++ .../core/src/evaluation/result-row-schema.ts | 1 + packages/core/src/evaluation/run-artifacts.ts | 59 ++++++++++++++++- packages/core/src/evaluation/types.ts | 6 ++ .../loaders/agent-skills-parser.test.ts | 18 +++++ 10 files changed, 186 insertions(+), 7 deletions(-) diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 859f1ff4b..7798fc554 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -94,6 +94,7 @@ export function buildIndexArtifactEntry( outputPath?: string; answerPath?: string; transcriptPath?: string; + rawProviderLogPath?: string; inputPath?: string; responsePath?: string; taskBundle?: MaterializedTaskBundlePaths; diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 43637c598..154a0f5cc 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -1044,8 +1044,10 @@ async function runSingleEvalFile(params: { // Full output with tool calls goes to OTel. 
const resultWithMetadata = withSourceMetadata(result, testFilePath, options); const trimmedOutput = trimOutputMessages(resultWithMetadata.output, options.outputMessages); + const serializableResult = { ...resultWithMetadata }; + serializableResult.rawProviderLogPath = undefined; const trimmedResult: EvaluationResult = { - ...resultWithMetadata, + ...serializableResult, output: trimmedOutput, }; await outputWriter.append(trimmedResult); diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 14ab4dfdd..1fde39ff9 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -587,6 +587,21 @@ describe('parseJsonlResults', () => { expect(results[0].trace.toolCalls).toEqual({ rg: 1 }); }); + it('does not treat parsed raw provider log pointers as fresh source artifacts', () => { + const content = `${JSON.stringify({ + test_id: 'raw-log-case', + target: 'codex', + score: 1, + output: 'done', + raw_provider_log_path: 'raw-log-case/outputs/raw/provider.log', + })}\n`; + + const results = parseJsonlResults(content); + + expect(results).toHaveLength(1); + expect(results[0].rawProviderLogPath).toBeUndefined(); + }); + it('handles empty content', () => { expect(parseJsonlResults('')).toHaveLength(0); }); @@ -919,6 +934,9 @@ describe('writeArtifactsFromResults', () => { 'chat', 'execute_tool', ]); + await expect( + readFile(path.join(testDir, 'transcript-case', 'outputs', 'transcript.json'), 'utf8'), + ).rejects.toThrow(); const indexLine = JSON.parse( (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), @@ -927,6 +945,54 @@ describe('writeArtifactsFromResults', () => { expect(indexLine).not.toHaveProperty('trace_path'); }); + it('copies optional raw provider logs as non-canonical evidence', async () => { + const rawLogPath = path.join(testDir, 'provider-source.log'); + const rawLog = [ + '# provider-native stream log', + 
'{"time":"00:00","data":{"camelCaseProviderKey":true,"toolInput":{"filePath":"src/index.ts"}}}', + '', + ].join('\n'); + await mkdir(testDir, { recursive: true }); + await writeFile(rawLogPath, rawLog, 'utf8'); + + const results = [ + makeResult({ + testId: 'raw-log-case', + target: 'codex', + output: 'Raw log copied', + rawProviderLogPath: rawLogPath, + }), + ]; + + await writeArtifactsFromResults(results, testDir); + + const copiedRawLogPath = path.join(testDir, 'raw-log-case', 'outputs', 'raw', 'provider.log'); + expect(await readFile(copiedRawLogPath, 'utf8')).toBe(rawLog); + + const transcriptPath = path.join(testDir, 'raw-log-case', 'outputs', 'transcript.jsonl'); + await expect(readFile(transcriptPath, 'utf8')).resolves.toContain( + '"schema_version":"agentv.transcript.v1"', + ); + await expect( + readFile(path.join(testDir, 'raw-log-case', 'outputs', 'transcript.json'), 'utf8'), + ).rejects.toThrow(); + + const envelope = TraceEnvelopeWireSchema.parse( + JSON.parse( + await readFile(path.join(testDir, 'raw-log-case', 'outputs', 'trace.json'), 'utf8'), + ), + ); + expect(envelope.artifacts.raw_provider_log_path).toBe('outputs/raw/provider.log'); + expect(envelope.artifacts.transcript_path).toBe('outputs/transcript.jsonl'); + + const indexLine = JSON.parse( + (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), + ); + expect(indexLine.raw_provider_log_path).toBe('raw-log-case/outputs/raw/provider.log'); + expect(indexLine.transcript_path).toBe('raw-log-case/outputs/transcript.jsonl'); + expect(indexLine).not.toHaveProperty('transcript_json_path'); + }); + it('omits per-test transcript links when the execution trace has no transcript rows', async () => { const results = [ makeResult({ diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index e3fc7803e..09b2cee6e 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ 
b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -450,11 +450,19 @@ timing, token usage, cost, source metadata, capture state, and trace pointers. Provider-native payloads can appear only inside opaque nested fields such as `metadata`, `source.metadata`, tool `input`, or tool `output`. +When an agent provider captures a native stream or session log, the result row +may also include `raw_provider_log_path`, pointing at +`outputs/raw/provider.log`. That file is raw evidence copied byte-for-byte from +the provider log and is not parsed, normalized, or required for replay, import, +Agent Skills conversion, or grading. AgentV does not write or maintain a +parallel `outputs/transcript.json` source of truth. + Use the transcript when you need a compact portable message/event projection over the trace, including exports to role/content arrays for chat-template or Hugging Face-style workflows. Use the trace when you need full lifecycle, span, -raw evidence, redaction, or adapter conversion details. The transcript is not a -second canonical trace source and is not a provider-native Pi session dump. +raw evidence pointers, redaction, or adapter conversion details. The transcript +is not a second canonical trace source and is not a provider-native Pi session +dump. Older transcript rows without `schema_version`, `capture`, or `trace` remain accepted for replay. diff --git a/apps/web/src/content/docs/docs/tools/import.mdx b/apps/web/src/content/docs/docs/tools/import.mdx index 54794e6ec..86ff592aa 100644 --- a/apps/web/src/content/docs/docs/tools/import.mdx +++ b/apps/web/src/content/docs/docs/tools/import.mdx @@ -176,7 +176,10 @@ Rows without `schema_version`, `capture`, or `trace` from older AgentV transcrip exports remain replayable. New eval run artifacts write the v1 shape. 
For eval run artifacts, `outputs/transcript.jsonl` is derived from `outputs/trace.json`; it is a portable message/event projection, not a second -canonical trace source or a provider-native session dump. +canonical trace source or a provider-native session dump. Provider-native +session or stream logs, when captured during an eval run, are separate raw +evidence artifacts referenced by `raw_provider_log_path`; Agent Skills import, +convert, transpile, and run paths do not require them. ## What Gets Parsed diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 41ee9610e..cc0e460ed 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -129,6 +129,21 @@ function usesFileReferencePrompt(provider: Provider): boolean { return isAgentProvider(provider) || provider.kind === 'cli'; } +function extractProviderRawLogPath(response: ProviderResponse): string | undefined { + const raw = response.raw; + if (!raw || typeof raw !== 'object' || Array.isArray(raw)) { + return undefined; + } + + const logFile = (raw as Record).logFile; + if (typeof logFile !== 'string') { + return undefined; + } + + const trimmed = logFile.trim(); + return trimmed.length > 0 ? 
trimmed : undefined; +} + interface EvaluationRuntimeOptions { readonly target: ResolvedTarget; readonly targets?: readonly TargetDefinition[]; @@ -1588,6 +1603,7 @@ async function runBatchEvaluation(options: { const tokenUsage = merged?.tokenUsage; const startTime = merged?.startTime; const endTime = merged?.endTime; + const rawProviderLogPath = extractProviderRawLogPath(providerResponse); // Extract candidate from last assistant message in output const candidate = extractLastAssistantContent(output); @@ -1615,6 +1631,7 @@ async function runBatchEvaluation(options: { tokenUsage, startTime, endTime, + rawProviderLogPath, targetResolver, availableTargets, verbose, @@ -1982,6 +1999,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise Provider | undefined; readonly availableTargets?: readonly string[]; readonly fileChanges?: string; @@ -2404,6 +2423,7 @@ async function evaluateCandidate(options: { tokenUsage, startTime, endTime, + rawProviderLogPath, targetResolver, availableTargets, fileChanges, @@ -2514,6 +2534,7 @@ async function evaluateCandidate(options: { output: candidate, scores: scores, trace: evaluationTrace, + rawProviderLogPath, fileChanges, executionStatus: classifyQualityStatus(score.score, evalThreshold), }; diff --git a/packages/core/src/evaluation/result-row-schema.ts b/packages/core/src/evaluation/result-row-schema.ts index fff7ef0e0..7c48c97d2 100644 --- a/packages/core/src/evaluation/result-row-schema.ts +++ b/packages/core/src/evaluation/result-row-schema.ts @@ -34,6 +34,7 @@ const RESULT_ROW_ALIASES = { gradingPath: 'grading_path', inputPath: 'input_path', outputPath: 'output_path', + rawProviderLogPath: 'raw_provider_log_path', responsePath: 'response_path', startTime: 'start_time', targetsPath: 'targets_path', diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index c15fb205d..498878818 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ 
b/packages/core/src/evaluation/run-artifacts.ts @@ -7,7 +7,7 @@ * snake_case here so every caller produces the same artifacts. */ -import { mkdir, readFile, writeFile } from 'node:fs/promises'; +import { copyFile, mkdir, readFile, writeFile } from 'node:fs/promises'; import path from 'node:path'; import { traceEnvelopeToTranscriptJsonLines } from '../import/types.js'; @@ -205,6 +205,7 @@ export interface IndexArtifactEntry { readonly output_path?: string; readonly answer_path?: string; readonly transcript_path?: string; + readonly raw_provider_log_path?: string; readonly input_path?: string; readonly response_path?: string; readonly task_dir?: string; @@ -226,7 +227,12 @@ export type ResultIndexArtifact = IndexArtifactEntry; export type AdditionalResultIndexFields = Partial< Pick< IndexArtifactEntry, - 'task_dir' | 'eval_path' | 'targets_path' | 'files_path' | 'graders_path' + | 'task_dir' + | 'eval_path' + | 'targets_path' + | 'files_path' + | 'graders_path' + | 'raw_provider_log_path' > >; @@ -732,6 +738,26 @@ function resultHasExecutionTraceTranscript(result: EvaluationResult): boolean { return result.output.length > 0 || result.trace.messages.length > 0; } +function rawProviderLogSourcePath(result: EvaluationResult): string | undefined { + const sourcePath = result.rawProviderLogPath?.trim(); + return sourcePath ? 
sourcePath : undefined; +} + +function rawProviderLogArtifactPath(outputsDir: string): string { + return path.join(outputsDir, 'raw', 'provider.log'); +} + +async function copyRawProviderLogArtifact(sourcePath: string, outputsDir: string): Promise { + const destinationPath = rawProviderLogArtifactPath(outputsDir); + if (path.resolve(sourcePath) === path.resolve(destinationPath)) { + return destinationPath; + } + + await mkdir(path.dirname(destinationPath), { recursive: true }); + await copyFile(sourcePath, destinationPath); + return destinationPath; +} + interface TraceEnvelopeSidecarParams { readonly result: EvaluationResult; readonly outputDir: string; @@ -755,6 +781,9 @@ function buildTraceEnvelopeSidecar(params: TraceEnvelopeSidecarParams): TraceEnv answer_path: params.result.output.length > 0 ? 'outputs/answer.md' : undefined, response_path: params.result.output.length > 0 ? 'outputs/response.md' : undefined, transcript_path: hasTranscript ? 'outputs/transcript.jsonl' : undefined, + raw_provider_log_path: rawProviderLogSourcePath(params.result) + ? 'outputs/raw/provider.log' + : undefined, }, duplicatePolicy: params.duplicatePolicy, }); @@ -782,6 +811,7 @@ export function buildIndexArtifactEntry( outputPath?: string; answerPath?: string; transcriptPath?: string; + rawProviderLogPath?: string; inputPath?: string; responsePath?: string; extraIndexFields?: AdditionalResultIndexFields; @@ -822,6 +852,9 @@ export function buildIndexArtifactEntry( transcript_path: options.transcriptPath ? toRelativeArtifactPath(options.outputDir, options.transcriptPath) : undefined, + raw_provider_log_path: options.rawProviderLogPath + ? toRelativeArtifactPath(options.outputDir, options.rawProviderLogPath) + : undefined, input_path: options.inputPath ? 
toRelativeArtifactPath(options.outputDir, options.inputPath) : undefined, @@ -849,6 +882,7 @@ export function buildResultIndexArtifact( const input = extractInput(result); const hasAnswer = result.output.length > 0; const hasTranscript = resultHasExecutionTraceTranscript(result); + const hasRawProviderLog = rawProviderLogSourcePath(result) !== undefined; return { timestamp: result.timestamp, @@ -878,6 +912,9 @@ export function buildResultIndexArtifact( transcript_path: hasTranscript ? path.posix.join(artifactSubdir, 'outputs', 'transcript.jsonl') : undefined, + raw_provider_log_path: hasRawProviderLog + ? path.posix.join(artifactSubdir, 'outputs', 'raw', 'provider.log') + : undefined, response_path: hasAnswer ? path.posix.join(artifactSubdir, 'outputs', 'response.md') : undefined, @@ -1122,6 +1159,8 @@ function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefin } const result = value as Record; + const parsedResult = { ...result }; + parsedResult.rawProviderLogPath = undefined; const legacyOutputMessages = Array.isArray(result.output) ? result.output.filter(isOutputMessage) : undefined; @@ -1148,7 +1187,7 @@ function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefin }); return { - ...result, + ...parsedResult, timestamp: typeof result.timestamp === 'string' ? result.timestamp : new Date(0).toISOString(), testId: typeof result.testId === 'string' ? result.testId : 'unknown', score: typeof result.score === 'number' ? 
result.score : 0, @@ -1263,6 +1302,10 @@ export async function writePerTestArtifacts( await writeFile(path.join(outputsDir, 'answer.md'), result.output, 'utf8'); await writeFile(path.join(outputsDir, 'response.md'), result.output, 'utf8'); } + const rawProviderLogSource = rawProviderLogSourcePath(result); + if (rawProviderLogSource) { + await copyRawProviderLogArtifact(rawProviderLogSource, outputsDir); + } const envelope = await writeTraceEnvelopeSidecar({ result, outputDir, @@ -1351,6 +1394,10 @@ export async function writeArtifactsFromResults( const transcriptPath = hasTranscriptProjection(result, envelope) ? path.join(outputsDir, 'transcript.jsonl') : undefined; + const rawProviderLogSource = rawProviderLogSourcePath(result); + const rawProviderLogPath = rawProviderLogSource + ? rawProviderLogArtifactPath(outputsDir) + : undefined; const projectionIdentity = envelope.projectionIdentity; if (!projectionIdentity) { throw new Error(`Result ${result.testId ?? 'unknown'} is missing projection identity`); @@ -1371,6 +1418,8 @@ export async function writeArtifactsFromResults( envelope, projectionIdentity, transcriptPath, + rawProviderLogSource, + rawProviderLogPath, identityId, }; }); @@ -1416,6 +1465,9 @@ export async function writeArtifactsFromResults( await writeFile(plan.answerPath, result.output, 'utf8'); await writeFile(plan.responsePath, result.output, 'utf8'); } + if (plan.rawProviderLogSource) { + await copyRawProviderLogArtifact(plan.rawProviderLogSource, plan.outputsDir); + } await writeFile( path.join(plan.outputsDir, 'trace.json'), `${JSON.stringify(toTraceEnvelopeWire(envelope), null, 2)}\n`, @@ -1442,6 +1494,7 @@ export async function writeArtifactsFromResults( outputPath: plan.answerPath, answerPath: plan.answerPath, transcriptPath: plan.transcriptPath, + rawProviderLogPath: plan.rawProviderLogPath, inputPath: plan.inputPath, responsePath: plan.responsePath, extraIndexFields, diff --git a/packages/core/src/evaluation/types.ts 
b/packages/core/src/evaluation/types.ts index 5d1e139ca..01b63b398 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -1184,6 +1184,12 @@ export interface EvaluationResult { readonly error?: string; /** Canonical execution trace: messages, events, metrics, and provider provenance. */ readonly trace: Trace; + /** + * Optional local provider-native session/stream log captured by a provider. + * Artifact writers copy this byte-for-byte into the run bundle as raw, + * non-canonical evidence and expose only the run-local pointer. + */ + readonly rawProviderLogPath?: string; /** Path to the temporary workspace directory (included on failure for debugging) */ readonly workspacePath?: string; /** Input messages sent to the agent. Always Message[] for consistent shape with output. */ diff --git a/packages/core/test/evaluation/loaders/agent-skills-parser.test.ts b/packages/core/test/evaluation/loaders/agent-skills-parser.test.ts index 166542fc5..40183638e 100644 --- a/packages/core/test/evaluation/loaders/agent-skills-parser.test.ts +++ b/packages/core/test/evaluation/loaders/agent-skills-parser.test.ts @@ -218,6 +218,24 @@ describe('parseAgentSkillsEvals', () => { expect(tests[0].metadata).toBeUndefined(); }); + it('ignores transcript artifact-looking fields in evals.json cases', () => { + const tests = parseAgentSkillsEvals({ + evals: [ + { + id: 1, + prompt: 'test prompt', + transcript_path: 'outputs/transcript.jsonl', + raw_provider_log_path: 'outputs/raw/provider.log', + }, + ], + }); + + expect(tests).toHaveLength(1); + expect(tests[0].metadata).toBeUndefined(); + expect(tests[0]).not.toHaveProperty('transcript_path'); + expect(tests[0]).not.toHaveProperty('raw_provider_log_path'); + }); + it('includes source in error messages', () => { expect(() => parseAgentSkillsEvals({}, 'my-evals.json')).toThrow('my-evals.json'); }); From f02bcae0e4f5721aed964135c460cf707f19e770 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: 
Sun, 21 Jun 2026 06:07:12 +0200 Subject: [PATCH 08/21] docs(results): avoid ref prefix conflicts Rename the planned artifact and oplog sidecar refs to sibling namespaces so they can coexist with agentv/results/v1. --- ...21-001-feat-av-quf-results-storage-plan.md | 69 +++++++++++-------- 1 file changed, 39 insertions(+), 30 deletions(-) diff --git a/docs/plans/2026-06-21-001-feat-av-quf-results-storage-plan.md b/docs/plans/2026-06-21-001-feat-av-quf-results-storage-plan.md index 71a1d7253..e5dcf346b 100644 --- a/docs/plans/2026-06-21-001-feat-av-quf-results-storage-plan.md +++ b/docs/plans/2026-06-21-001-feat-av-quf-results-storage-plan.md @@ -45,7 +45,7 @@ without creating another hosted results platform inside AgentV. - Define storage backend modes and per-mode listing/index strategies. - Pin the git-native ref and path layout for `agentv/results/v1`, - `agentv/results/v1/artifacts`, and `agentv/results/v1/oplog`. + `agentv/artifacts/v1`, and `agentv/oplog/v1`. - Define retention, compaction, and migration rules for run metadata and heavy artifacts. - Define compact publication export as a derived artifact over `benchmark.json` and `index.jsonl`, with no required `eval.txt`. @@ -88,9 +88,9 @@ without creating another hosted results platform inside AgentV. - R5. The primary results ref is `agentv/results/v1`. - R6. Heavy artifact sidecars use the single artifact ref or namespace - `agentv/results/v1/artifacts`, with path prefixes such as `transcripts/`, - `raw-logs/`, and `screenshots/`. -- R7. Mutable operations use the single oplog ref or namespace `agentv/results/v1/oplog`. + `agentv/artifacts/v1`, with path prefixes such as `transcripts/`, `raw-logs/`, + and `screenshots/`. +- R7. Mutable operations use the single oplog ref or namespace `agentv/oplog/v1`. - R8. The git-native branch must keep deterministic orphan genesis and must not create windowed branches or per-run branches. - R9. 
Path sharding is not part of v1 unless measurement at realistic scale proves it is @@ -103,7 +103,7 @@ without creating another hosted results platform inside AgentV. - R11. Hybrid and blob-native modes must support object lifecycle policy alignment for artifact payloads without deleting index metadata prematurely. - R12. Transcript migration must support transcripts under - `agentv/results/v1/artifacts` while preserving existing logical artifact references. + `agentv/artifacts/v1` while preserving existing logical artifact references. - R13. Publication export must be compact and derived from `benchmark.json` plus `index.jsonl`; it must not require an authored or generated `eval.txt`. @@ -140,17 +140,20 @@ without creating another hosted results platform inside AgentV. `artifact-blobs`, `blobs`, or per-artifact refs. Prefix by artifact class, for example `transcripts//...`, `raw-logs//...`, and `screenshots//...`. -- KTD5. Hybrid mode keeps git as the metadata and index authority, while object storage +- KTD5. Use sibling Git refs for results, artifacts, and oplog. Git refs are stored + path-like, so `agentv/results/v1` cannot coexist with child refs such as + `agentv/results/v1/artifacts` or `agentv/results/v1/oplog`. +- KTD6. Hybrid mode keeps git as the metadata and index authority, while object storage stores selected heavy payload bytes. Git contains stable artifact locator records with checksums, sizes, and logical paths so readers can verify fetched payloads. -- KTD6. Blob-native mode mirrors the same logical namespaces in the bucket, but does not +- KTD7. Blob-native mode mirrors the same logical namespaces in the bucket, but does not emulate git refs. It owns bucket manifests and per-prefix object listings. -- KTD7. Mutable operations are derived overlays. Existing `metadata/runs/**/tags.json` +- KTD8. Mutable operations are derived overlays. 
Existing `metadata/runs/**/tags.json` is a compatibility read/write surface until oplog materialization replaces direct overlay writes. -- KTD8. Publication export is a projection. It should read completed run bundles and +- KTD9. Publication export is a projection. It should read completed run bundles and emit a compact publishable directory without becoming a new source of truth. -- KTD9. Backblaze B2 is addressed only through S3-compatible endpoints and Signature V4. +- KTD10. Backblaze B2 is addressed only through S3-compatible endpoints and Signature V4. The object client should be a standard S3 client configured with endpoint, region, bucket, and credentials. @@ -164,8 +167,8 @@ without creating another hosted results platform inside AgentV. flowchart TB Local[Local run workspace .agentv/results/runs] --> Publish[Result publisher] Publish --> GitIndex[agentv/results/v1 runs metadata] - Publish --> GitArtifacts[agentv/results/v1/artifacts artifact sidecar] - Publish --> Oplog[agentv/results/v1/oplog mutable ops] + Publish --> GitArtifacts[agentv/artifacts/v1 artifact sidecar] + Publish --> Oplog[agentv/oplog/v1 mutable ops] Publish --> Bucket[(B2 S3-compatible bucket)] GitIndex --> Dashboard[Dashboard and CLI readers] @@ -182,8 +185,8 @@ flowchart TB | Mode | Canonical index/listing | Artifact payloads | Mutable ops | Git dependency | | --- | --- | --- | --- | --- | -| `git-native` | `git ls-tree -r agentv/results/v1 -- runs/` plus `git cat-file --batch` for `benchmark.json` | `agentv/results/v1/artifacts` stores payload bytes | `agentv/results/v1/oplog` | Required | -| `hybrid` | Same primary git ref as `git-native` | Object storage stores selected payload bytes; git stores locators under the artifact namespace | `agentv/results/v1/oplog` | Required for index/oplog | +| `git-native` | `git ls-tree -r agentv/results/v1 -- runs/` plus `git cat-file --batch` for `benchmark.json` | `agentv/artifacts/v1` stores payload bytes | `agentv/oplog/v1` | Required | +| 
`hybrid` | Same primary git ref as `git-native` | Object storage stores selected payload bytes; git stores locators under the artifact namespace | `agentv/oplog/v1` | Required for index/oplog | | `blob-native` | Bucket manifest under the results namespace, with `ListObjectsV2` fallback by prefix | Object storage stores all payloads | Bucket oplog prefix | None | ### Logical Namespace Shape @@ -195,12 +198,12 @@ agentv/results/v1 runs/// metadata/runs///materialized-tags.json -agentv/results/v1/artifacts +agentv/artifacts/v1 transcripts////transcript.jsonl raw-logs////.jsonl screenshots////.png -agentv/results/v1/oplog +agentv/oplog/v1 actors//-.json ``` @@ -292,7 +295,9 @@ ref. Do not add windowed or per-run branches. Do not shard paths before measurem - `packages/core/src/evaluation/results-repo.ts` - Keep `DEFAULT_RESULTS_BRANCH = 'agentv/results/v1'`. - Add constants for the artifact and oplog refs: - `agentv/results/v1/artifacts` and `agentv/results/v1/oplog`. + `agentv/artifacts/v1` and `agentv/oplog/v1`. + - Add a shared test assertion that all three refs pass `git check-ref-format` + and no ref is a prefix parent or child of another. - Extend safe-path staging to include only owned top-level paths on each ref. - Keep `createResultsGenesisCommit()` and `createOrphanResultsBranch()` behavior for any new git storage refs so independent clients converge on the same root. @@ -305,7 +310,7 @@ ref. Do not add windowed or per-run branches. Do not shard paths before measurem - `packages/core/test/evaluation/results-repo.test.ts` - Add deterministic genesis tests for the artifact and oplog refs if they are created by separate helper functions. - - Add tests that two clients publishing to `agentv/results/v1/artifacts` converge + - Add tests that two clients publishing to `agentv/artifacts/v1` converge rather than minting divergent orphan roots. **Layout rules:** @@ -313,11 +318,11 @@ ref. Do not add windowed or per-run branches. 
Do not shard paths before measurem - Primary ref `agentv/results/v1`: - Owns `runs/**` and lightweight materialized metadata. - Lists runs only through `runs/**/benchmark.json`. -- Artifact ref `agentv/results/v1/artifacts`: +- Artifact ref `agentv/artifacts/v1`: - Owns payload classes under `transcripts/`, `raw-logs/`, and `screenshots/`. - May store payload bytes in `git-native`. - May store locator manifests in `hybrid`. -- Oplog ref `agentv/results/v1/oplog`: +- Oplog ref `agentv/oplog/v1`: - Owns append-only operation records under `actors/**`. - Is never used for immutable run payloads. @@ -326,8 +331,10 @@ ref. Do not add windowed or per-run branches. Do not shard paths before measurem - Unit test constants and normalized default branch. - Integration test with a temporary repo that publishes: - one run to `agentv/results/v1`; - - one transcript payload to `agentv/results/v1/artifacts`; - - one tag operation to `agentv/results/v1/oplog`. + - one transcript payload to `agentv/artifacts/v1`; + - one tag operation to `agentv/oplog/v1`. +- Assert all three refs can coexist in one temporary repo because none is a + path-prefix of another. - Assert the source checkout branch does not switch. - Assert no `agentv/results/v1/` or `agentv/results/run/` refs are created. @@ -362,7 +369,7 @@ after logical deletion. existing path fields. - `apps/cli/src/commands/results/remote.ts` - Teach `ensureRemoteRunAvailable()` and future artifact resolvers to fetch a - transcript from `agentv/results/v1/artifacts` when the run-local path is a logical + transcript from `agentv/artifacts/v1` when the run-local path is a logical reference. - `apps/cli/src/commands/results/serve.ts` - Keep file API responses stable for transcript JSONL, whether bytes are local, @@ -372,7 +379,7 @@ after logical deletion. - Logical prune commit: - Removes selected `runs///**` from `agentv/results/v1`. 
- - Removes selected artifact paths from `agentv/results/v1/artifacts` or replaces + - Removes selected artifact paths from `agentv/artifacts/v1` or replaces hybrid locator records with tombstones. - Appends retention operations to oplog when mutable state is affected. - Compaction: @@ -398,7 +405,7 @@ after logical deletion. - Existing runs may have `transcript_path` pointing at `/outputs/transcript.jsonl`. - Migration copies transcript bytes to - `agentv/results/v1/artifacts:transcripts////transcript.jsonl` + `agentv/artifacts/v1:transcripts////transcript.jsonl` or the matching object-store key. - `index.jsonl` keeps `transcript_path` as the logical path and gains optional locator metadata with `backend`, `ref` or bucket namespace, `path`, `sha256`, and @@ -422,7 +429,7 @@ after logical deletion. **Acceptance:** - Retention can remove old live runs without breaking listing for retained runs. -- A transcript migrated under `agentv/results/v1/artifacts` remains viewable through +- A transcript migrated under `agentv/artifacts/v1` remains viewable through the existing Dashboard file API. - Compaction cannot run implicitly as a side effect of publish, sync, or Dashboard polling. @@ -498,7 +505,7 @@ Tags are the first materialized view and use add-wins set semantics. payload, created timestamp, and optional causal metadata. - Implement add-wins tag projection. - `packages/core/src/evaluation/results-repo.ts` - - Add git append helpers for `agentv/results/v1/oplog`. + - Add git append helpers for `agentv/oplog/v1`. - `apps/cli/src/commands/results/serve.ts` - Route tag set, clear, and read endpoints through oplog projection for remote runs once the adapter is available. @@ -526,7 +533,7 @@ actor's later tag addition. **Where oplog lives by mode:** -- `git-native`: `agentv/results/v1/oplog` git ref, under +- `git-native`: `agentv/oplog/v1` git ref, under `actors//-.json`. - `hybrid`: same git oplog ref, because git remains the metadata authority. 
- `blob-native`: object-store prefix @@ -838,8 +845,10 @@ results: - [ ] Spec includes one section each for storage modes, git-native layout, retention/compaction, publication export, oplog, and object storage. -- [ ] All refs are pinned exactly: `agentv/results/v1`, - `agentv/results/v1/artifacts`, and `agentv/results/v1/oplog`. +- [ ] All refs are pinned exactly: `agentv/results/v1`, `agentv/artifacts/v1`, + and `agentv/oplog/v1`. +- [ ] Shared ref tests assert the three refs are valid Git refnames and cannot + prefix-conflict. - [ ] The artifact sidecar is called `artifacts`, not `artifact-blobs` or `blob`. - [ ] The plan has no windowed or per-run branches. - [ ] Path sharding is deferred until realistic measurement proves need. From 61d4ede76aed875362347f18a5c686255ab98151 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 06:07:29 +0200 Subject: [PATCH 09/21] fix(results): avoid ref prefix conflicts --- apps/cli/test/commands/results/shared.test.ts | 2 +- .../src/content/docs/docs/tools/results.mdx | 2 +- .../src/evaluation/result-artifact-contract.ts | 11 ++++++++--- .../core/test/evaluation/results-repo.test.ts | 18 +++++++++++++++--- 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/apps/cli/test/commands/results/shared.test.ts b/apps/cli/test/commands/results/shared.test.ts index e9d2f33a0..5e701116e 100644 --- a/apps/cli/test/commands/results/shared.test.ts +++ b/apps/cli/test/commands/results/shared.test.ts @@ -107,7 +107,7 @@ describe('results shared source resolution', () => { timing_path: 'pointer-case/timing.json', artifact_pointers: { transcript: { - ref: 'agentv/results/v1/artifacts', + ref: 'agentv/artifacts/v1', key: 'transcripts/pointer-case/outputs/transcript.jsonl', object_version: 'sha256:test', path: transcriptRelativePath, diff --git a/apps/web/src/content/docs/docs/tools/results.mdx b/apps/web/src/content/docs/docs/tools/results.mdx index 50254dfc8..be018c780 100644 --- 
a/apps/web/src/content/docs/docs/tools/results.mdx +++ b/apps/web/src/content/docs/docs/tools/results.mdx @@ -127,7 +127,7 @@ The CLI contract is deliberately narrow: `agentv results` manages local result a Use these supported remote workflows instead: -- **Automatic publishing:** configure `projects[].results` or top-level `results`; new `agentv eval` and `agentv pipeline bench` runs publish completed artifacts after the run completes. Use `repo_path: .` with `branch: agentv/results/v1` to store primary result records on a dedicated branch of the source repo. AgentV reserves `agentv/results/v1` for primary results, `agentv/results/v1/artifacts` for heavy artifact payloads, and `agentv/results/v1/oplog` for mutable run/result operations. `repo_path` means an existing local Git checkout, distinct from `workspace.repos[].repo`, which is a portable repository identity. Set `sync.auto_push: true` to push after publish, or `sync.require_push: true` in CI to fail when that push fails. While an eval is still running, [WIP checkpoints](/docs/tools/wip-checkpoints/) can keep partial run output durable on `agentv/wip/...` branches when auto-push is enabled. +- **Automatic publishing:** configure `projects[].results` or top-level `results`; new `agentv eval` and `agentv pipeline bench` runs publish completed artifacts after the run completes. Use `repo_path: .` with `branch: agentv/results/v1` to store primary result records on a dedicated branch of the source repo. AgentV reserves `agentv/results/v1` for primary results, `agentv/artifacts/v1` for heavy artifact payloads, and `agentv/oplog/v1` for mutable run/result operations. `repo_path` means an existing local Git checkout, distinct from `workspace.repos[].repo`, which is a portable repository identity. Set `sync.auto_push: true` to push after publish, or `sync.require_push: true` in CI to fail when that push fails. 
While an eval is still running, [WIP checkpoints](/docs/tools/wip-checkpoints/) can keep partial run output durable on `agentv/wip/...` branches when auto-push is enabled. - **Manual Dashboard sync:** run `agentv dashboard`, open the project, and use **Sync Project**. - **Manual API sync:** while Dashboard is running, call `GET /api/projects/:projectId/remote/status` or `POST /api/projects/:projectId/remote/sync` for project-scoped automation. Single-project sessions also expose `GET /api/remote/status` and `POST /api/remote/sync`. - **Git escape hatch:** for advanced recovery, inspect or repair the configured `projects[].results.path` clone with `git` directly, then sync again. diff --git a/packages/core/src/evaluation/result-artifact-contract.ts b/packages/core/src/evaluation/result-artifact-contract.ts index f91c1f8bb..a8f122bf5 100644 --- a/packages/core/src/evaluation/result-artifact-contract.ts +++ b/packages/core/src/evaluation/result-artifact-contract.ts @@ -9,8 +9,8 @@ */ export const AGENTV_RESULTS_PRIMARY_REF = 'agentv/results/v1' as const; -export const AGENTV_RESULTS_ARTIFACTS_REF = 'agentv/results/v1/artifacts' as const; -export const AGENTV_RESULTS_OPLOG_REF = 'agentv/results/v1/oplog' as const; +export const AGENTV_RESULTS_ARTIFACTS_REF = 'agentv/artifacts/v1' as const; +export const AGENTV_RESULTS_OPLOG_REF = 'agentv/oplog/v1' as const; export const AGENTV_RESULTS_REFS = { primary: AGENTV_RESULTS_PRIMARY_REF, @@ -27,7 +27,12 @@ export const TRACE_JSON_MEDIA_TYPE = 'application/vnd.agentv.trace.v1+json' as c export type AgentVResultsRefName = (typeof AGENTV_RESULTS_REFS)[keyof typeof AGENTV_RESULTS_REFS]; -export type ResultArtifactFamily = 'traces' | 'transcripts' | 'outputs' | 'raw-logs' | 'screenshots'; +export type ResultArtifactFamily = + | 'traces' + | 'transcripts' + | 'outputs' + | 'raw-logs' + | 'screenshots'; export interface ResultArtifactPointer { readonly ref: AgentVResultsRefName | string; diff --git 
a/packages/core/test/evaluation/results-repo.test.ts b/packages/core/test/evaluation/results-repo.test.ts index 3d956420d..c66852b8f 100644 --- a/packages/core/test/evaluation/results-repo.test.ts +++ b/packages/core/test/evaluation/results-repo.test.ts @@ -5,8 +5,8 @@ import path from 'node:path'; import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; -import { AGENTV_RESULTS_REFS } from '../../src/evaluation/result-artifact-contract.js'; import type { ResultsConfig } from '../../src/evaluation/loaders/config-loader.js'; +import { AGENTV_RESULTS_REFS } from '../../src/evaluation/result-artifact-contract.js'; import { DEFAULT_RESULTS_BRANCH, buildWipBranchName, @@ -52,6 +52,17 @@ function createResultsConfig(repoDir: string, cloneDir: string): ResultsConfig { }; } +function refsHavePrefixConflict(refs: readonly string[]): boolean { + for (const ref of refs) { + for (const other of refs) { + if (ref !== other && other.startsWith(`${ref}/`)) { + return true; + } + } + } + return false; +} + function initializeRemoteRepo(rootDir: string): { remoteDir: string; seedDir: string } { const remoteDir = path.join(rootDir, 'results-remote.git'); git(`git init --bare --initial-branch=main --quiet "${remoteDir}"`, rootDir); @@ -336,9 +347,10 @@ describe('results repo write path', () => { expect(DEFAULT_RESULTS_BRANCH).toBe(AGENTV_RESULTS_REFS.primary); expect(AGENTV_RESULTS_REFS).toEqual({ primary: 'agentv/results/v1', - artifacts: 'agentv/results/v1/artifacts', - oplog: 'agentv/results/v1/oplog', + artifacts: 'agentv/artifacts/v1', + oplog: 'agentv/oplog/v1', }); + expect(refsHavePrefixConflict(Object.values(AGENTV_RESULTS_REFS))).toBe(false); expect(normalized.branch).toBe('agentv/results/v1'); expect(normalized.repo_path).toBe('/tmp/source-project'); expect(normalized.auto_push).toBe(false); From 3501891bb8f929e015b4d43976374c13632502af Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 06:07:41 +0200 Subject: [PATCH 10/21] fix(results): 
avoid prefix-conflicting oplog ref --- apps/cli/src/commands/results/run-oplog.ts | 2 +- apps/cli/src/commands/results/run-tags.ts | 2 +- .../test/commands/results/run-oplog.test.ts | 33 ++++++++++++++++++- 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/apps/cli/src/commands/results/run-oplog.ts b/apps/cli/src/commands/results/run-oplog.ts index afd295cba..c307e5841 100644 --- a/apps/cli/src/commands/results/run-oplog.ts +++ b/apps/cli/src/commands/results/run-oplog.ts @@ -8,7 +8,7 @@ import { randomUUID } from 'node:crypto'; * tag replacement, and the materialized final-state shape that readers consume. */ -export const RUN_OPLOG_REF = 'agentv/results/v1/oplog'; +export const RUN_OPLOG_REF = 'agentv/oplog/v1'; export const RUN_OPERATION_SCHEMA_VERSION = 'agentv.run_operation.v1'; export type RunFinalStateLifecycle = 'active' | 'hidden' | 'deleted'; diff --git a/apps/cli/src/commands/results/run-tags.ts b/apps/cli/src/commands/results/run-tags.ts index 4cf18e83d..98a6f5490 100644 --- a/apps/cli/src/commands/results/run-tags.ts +++ b/apps/cli/src/commands/results/run-tags.ts @@ -10,7 +10,7 @@ * { * "tags": ["baseline", "v2-prompt"], * "updated_at": "2026-04-10T00:00:00.000Z", - * "oplog_watermark": { "ref": "agentv/results/v1/oplog" } + * "oplog_watermark": { "ref": "agentv/oplog/v1" } * } * ``` * diff --git a/apps/cli/test/commands/results/run-oplog.test.ts b/apps/cli/test/commands/results/run-oplog.test.ts index 1ab0ce226..ff6990bb0 100644 --- a/apps/cli/test/commands/results/run-oplog.test.ts +++ b/apps/cli/test/commands/results/run-oplog.test.ts @@ -1,4 +1,5 @@ import { describe, expect, it } from 'bun:test'; +import { execFileSync } from 'node:child_process'; import { RUN_OPERATION_SCHEMA_VERSION, @@ -9,9 +10,39 @@ import { watermarkFromRunOperation, } from '../../../src/commands/results/run-oplog.js'; +const PRIMARY_RESULTS_REF = 'agentv/results/v1'; +const ARTIFACTS_REF = 'agentv/artifacts/v1'; + +function refsHavePrefixConflict(left: string, 
right: string): boolean { + return left === right || left.startsWith(`${right}/`) || right.startsWith(`${left}/`); +} + +function isValidGitBranchRef(ref: string): boolean { + try { + execFileSync('git', ['check-ref-format', `refs/heads/${ref}`], { stdio: 'ignore' }); + return true; + } catch { + return false; + } +} + describe('run operation log contract', () => { it('defines the stable oplog ref', () => { - expect(RUN_OPLOG_REF).toBe('agentv/results/v1/oplog'); + expect(RUN_OPLOG_REF).toBe('agentv/oplog/v1'); + }); + + it('keeps results, artifacts, and oplog refs non-prefix-conflicting', () => { + const refs = [PRIMARY_RESULTS_REF, ARTIFACTS_REF, RUN_OPLOG_REF]; + + for (const left of refs) { + expect(isValidGitBranchRef(left)).toBe(true); + } + + for (const [index, left] of refs.entries()) { + for (const right of refs.slice(index + 1)) { + expect(refsHavePrefixConflict(left, right)).toBe(false); + } + } }); it('builds a typed tag replacement operation envelope', () => { From d13e8abdca30edddb2ffb6225129a04ccaf0d9c9 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 06:51:57 +0200 Subject: [PATCH 11/21] fix(artifacts): preserve raw logs in per-case evals --- packages/core/src/evaluation/orchestrator.ts | 1 + .../core/test/evaluation/orchestrator.test.ts | 44 +++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index cc0e460ed..213940190 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -2122,6 +2122,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise { + const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-raw-provider-log-')); + const rawLogPath = path.join(tempDir, 'provider-native-session.jsonl'); + writeFileSync(rawLogPath, '{"event":"provider-native"}\n', 'utf8'); + + const provider = new SequenceProvider('mock', { + responses: [ + { + output: [{ 
role: 'assistant', content: 'Raw log evidence preserved.' }], + raw: { logFile: rawLogPath }, + }, + ], + }); + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: evaluatorRegistry, + }); + + expect(result.rawProviderLogPath).toBe(rawLogPath); + + const outputDir = path.join(tempDir, 'artifacts'); + await writeArtifactsFromResults([result], outputDir); + + const outputsDir = path.join(outputDir, 'test-dataset', 'case-1', 'outputs'); + expect(readFileSync(path.join(outputsDir, 'raw', 'provider.log'), 'utf8')).toBe( + '{"event":"provider-native"}\n', + ); + expect(readdirSync(outputsDir)).toContain('transcript.jsonl'); + expect(readdirSync(outputsDir)).not.toContain('transcript.json'); + + const indexRows = readFileSync(path.join(outputDir, 'index.jsonl'), 'utf8') + .trim() + .split('\n') + .map((line) => JSON.parse(line) as Record); + expect(indexRows[0]?.raw_provider_log_path).toBe( + 'test-dataset/case-1/outputs/raw/provider.log', + ); + expect(indexRows[0]?.transcript_path).toBe('test-dataset/case-1/outputs/transcript.jsonl'); + }); + it('reports failed progress status for batch item errors', async () => { class BatchProvider implements Provider { readonly id = 'batch:mock'; From f8d70fee15fdf5acc88b3c178605e280a039c6fb Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 06:53:17 +0200 Subject: [PATCH 12/21] fix(results): preserve local tag clear watermark --- apps/cli/src/commands/results/run-tags.ts | 13 +--- apps/cli/src/commands/results/serve.ts | 13 +++- .../test/commands/results/run-tags.test.ts | 52 +++++++++++++ apps/cli/test/commands/results/serve.test.ts | 77 +++++++++++++++++++ .../dashboard/src/components/AnalyticsTab.tsx | 2 +- apps/dashboard/src/lib/api.ts | 4 +- .../src/content/docs/docs/tools/dashboard.mdx | 2 +- 7 files changed, 147 insertions(+), 16 deletions(-) create mode 100644 apps/cli/test/commands/results/run-tags.test.ts diff --git 
a/apps/cli/src/commands/results/run-tags.ts b/apps/cli/src/commands/results/run-tags.ts index 98a6f5490..9464aa068 100644 --- a/apps/cli/src/commands/results/run-tags.ts +++ b/apps/cli/src/commands/results/run-tags.ts @@ -24,7 +24,7 @@ * - No control characters (\n, \t, DEL, etc.) * - Tags are deduplicated case-sensitively * - A run can have at most 20 tags - * - Writing an empty array removes the sidecar file + * - Writing an empty array records a clear/tombstone state with a watermark * * To extend (e.g. add colored labels or descriptions): add optional fields * to `RunTagsFile` and keep the schema additive so older files still parse. @@ -85,7 +85,6 @@ export function readRunTags(manifestPath: string): RunTagsFile | undefined { const tags = record.tags.filter( (t): t is string => typeof t === 'string' && t.trim().length > 0, ); - if (tags.length === 0) return undefined; const updatedAt = typeof record.updated_at === 'string' ? record.updated_at : ''; return { tags, @@ -98,15 +97,11 @@ export function readRunTags(manifestPath: string): RunTagsFile | undefined { } /** - * Write tags for a run. Replaces any existing tags. Pass an empty array - * to remove the sidecar entirely. + * Write tags for a run. Replaces any existing tags. Pass an empty array to + * record that tags were intentionally cleared while preserving the watermark. 
*/ -export function writeRunTags(manifestPath: string, tags: readonly string[]): RunTagsFile | null { +export function writeRunTags(manifestPath: string, tags: readonly string[]): RunTagsFile { const cleaned = normalizeTags(tags); - if (cleaned.length === 0) { - deleteRunTags(manifestPath); - return null; - } const runPath = inferRunRelativePath(manifestPath); const operation = createRunTagsSetOperation({ runId: buildRunIdFromRelativePath(runPath), diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 8350567cc..57a71847b 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -85,7 +85,7 @@ import { type RunReadStateFields, materializeRunState, } from './run-oplog.js'; -import { deleteRunTags, readRunTags, writeRunTags } from './run-tags.js'; +import { readRunTags, writeRunTags } from './run-tags.js'; import { type StudioConfig, loadStudioConfig, saveStudioConfig } from './studio-config.js'; // ── Source resolution ──────────────────────────────────────────────────── @@ -1542,10 +1542,17 @@ async function handleRunTagsDelete(c: C, { searchDir, projectId }: DataContext) }); } - deleteRunTags(meta.path); + const entry = writeRunTags(meta.path, []); + const responseState = localTagMutationResponse({ + tags: entry.tags, + updatedAt: entry.updated_at, + watermark: entry.oplog_watermark, + }); return c.json({ ok: true, - ...localTagMutationResponse({ tags: [] }), + tags: entry.tags, + ...responseState, + updated_at: entry.updated_at, }); } catch (err) { return c.json({ error: (err as Error).message }, remoteMetadataErrorStatus(err)); diff --git a/apps/cli/test/commands/results/run-tags.test.ts b/apps/cli/test/commands/results/run-tags.test.ts new file mode 100644 index 000000000..23dabb79c --- /dev/null +++ b/apps/cli/test/commands/results/run-tags.test.ts @@ -0,0 +1,52 @@ +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { existsSync, mkdirSync, 
mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; + +import { RUN_OPLOG_REF } from '../../../src/commands/results/run-oplog.js'; +import { + deleteRunTags, + readRunTags, + runTagsPath, + writeRunTags, +} from '../../../src/commands/results/run-tags.js'; + +describe('run tags sidecar', () => { + let tempDir: string; + let manifestPath: string; + + beforeEach(() => { + tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-run-tags-')); + const runDir = path.join(tempDir, '.agentv', 'results', 'runs', 'default', '2026-clear-tags'); + mkdirSync(runDir, { recursive: true }); + manifestPath = path.join(runDir, 'index.jsonl'); + writeFileSync(manifestPath, '{"test_id":"alpha","score":1}\n', 'utf8'); + }); + + afterEach(() => { + rmSync(tempDir, { recursive: true, force: true }); + }); + + it('records empty tags as a clear tombstone with an oplog watermark', () => { + writeRunTags(manifestPath, ['baseline']); + + const cleared = writeRunTags(manifestPath, []); + const reloaded = readRunTags(manifestPath); + + expect(existsSync(runTagsPath(manifestPath))).toBe(true); + expect(cleared.tags).toEqual([]); + expect(cleared.oplog_watermark?.ref).toBe(RUN_OPLOG_REF); + expect(cleared.oplog_watermark?.operation_id).toBeString(); + expect(reloaded).toEqual(cleared); + expect(readFileSync(runTagsPath(manifestPath), 'utf8')).toContain('"tags": []'); + }); + + it('keeps physical sidecar deletion explicit', () => { + writeRunTags(manifestPath, []); + + deleteRunTags(manifestPath); + + expect(existsSync(runTagsPath(manifestPath))).toBe(false); + expect(readRunTags(manifestPath)).toBeUndefined(); + }); +}); diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 4e7756542..c068a2211 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -1028,6 +1028,83 @@ describe('serve app', () => { }); }); + 
it('preserves a local tag clear watermark after DELETE /tags', async () => { + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); + mkdirSync(runsDir, { recursive: true }); + const filename = '2026-03-25T10-30-00-000Z'; + const runDir = path.join(runsDir, filename); + mkdirSync(runDir, { recursive: true }); + writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(RESULT_A)); + writeFileSync( + path.join(runDir, 'tags.json'), + `${JSON.stringify( + { + tags: ['accepted'], + updated_at: '2026-06-21T10:15:00.000Z', + oplog_watermark: { + ref: RUN_OPLOG_REF, + operation_id: 'op-before-clear', + updated_at: '2026-06-21T10:15:00.000Z', + }, + }, + null, + 2, + )}\n`, + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + + const deleteRes = await app.request(`/api/runs/${encodeURIComponent(filename)}/tags`, { + method: 'DELETE', + }); + expect(deleteRes.status).toBe(200); + const deleteData = (await deleteRes.json()) as { + ok: boolean; + tags: string[]; + final_state: { lifecycle: string; tags: string[] }; + oplog_watermark: { ref: string; operation_id?: string; updated_at?: string }; + updated_at: string; + }; + expect(deleteData.ok).toBe(true); + expect(deleteData.tags).toEqual([]); + expect(deleteData.final_state).toEqual({ + lifecycle: 'active', + tags: [], + }); + expect(deleteData.oplog_watermark.ref).toBe(RUN_OPLOG_REF); + expect(deleteData.oplog_watermark.operation_id).toBeString(); + expect(deleteData.oplog_watermark.operation_id).not.toBe('op-before-clear'); + expect(deleteData.oplog_watermark.updated_at).toBe(deleteData.updated_at); + + const tagFile = JSON.parse(readFileSync(path.join(runDir, 'tags.json'), 'utf8')) as { + tags: string[]; + oplog_watermark: { ref: string; operation_id?: string; updated_at?: string }; + }; + expect(tagFile.tags).toEqual([]); + expect(tagFile.oplog_watermark.operation_id).toBe(deleteData.oplog_watermark.operation_id); + + const reloadedApp = createApp([], tempDir, tempDir, undefined, { 
studioDir }); + const detailRes = await reloadedApp.request(`/api/runs/${encodeURIComponent(filename)}`); + expect(detailRes.status).toBe(200); + const detailData = (await detailRes.json()) as { + tags: string[]; + final_state: { lifecycle: string; tags: string[] }; + oplog_watermark: { ref: string; operation_id?: string; updated_at?: string }; + }; + expect(detailData).toMatchObject({ + tags: [], + final_state: { + lifecycle: 'active', + tags: [], + }, + oplog_watermark: { + ref: RUN_OPLOG_REF, + operation_id: deleteData.oplog_watermark.operation_id, + updated_at: deleteData.oplog_watermark.updated_at, + }, + }); + }); + it('computes pass_rate using the configured dashboard threshold', async () => { const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); mkdirSync(runsDir, { recursive: true }); diff --git a/apps/dashboard/src/components/AnalyticsTab.tsx b/apps/dashboard/src/components/AnalyticsTab.tsx index 4c45c48aa..0378e4d76 100644 --- a/apps/dashboard/src/components/AnalyticsTab.tsx +++ b/apps/dashboard/src/components/AnalyticsTab.tsx @@ -15,7 +15,7 @@ * Backend contract: * - `GET /api/compare` → { cells, runs? } * - `PUT /api/runs/:runId/tags` → replaces sidecar tags.json - * - `DELETE /api/runs/:runId/tags` → removes sidecar + * - `DELETE /api/runs/:runId/tags` → records an empty tag state * * To extend with a new mode: add a value to `ViewMode`, a button in the mode * toggle, and a new body component in the content switch. Hooks in any new diff --git a/apps/dashboard/src/lib/api.ts b/apps/dashboard/src/lib/api.ts index 467c3b216..960b37160 100644 --- a/apps/dashboard/src/lib/api.ts +++ b/apps/dashboard/src/lib/api.ts @@ -665,7 +665,7 @@ export async function deleteRunApi(runId: string, projectId?: string): Promise; } -/** Remove the tags sidecar for a run. */ +/** Clear the tags for a run while preserving the clear watermark. */ export async function deleteRunTagsApi(runId: string, projectId?: string): Promise { const url = projectId ? 
`${projectApiBase(projectId)}/runs/${encodeURIComponent(runId)}/tags` diff --git a/apps/web/src/content/docs/docs/tools/dashboard.mdx b/apps/web/src/content/docs/docs/tools/dashboard.mdx index fadec9963..fd5f9eced 100644 --- a/apps/web/src/content/docs/docs/tools/dashboard.mdx +++ b/apps/web/src/content/docs/docs/tools/dashboard.mdx @@ -146,7 +146,7 @@ Select 2+ rows with the checkboxes and click the sticky **Compare N** action to ### Retroactive tags -Click any row's **Tags** cell to tag a run after the fact. Each run can carry multiple free-form tags (max 20, up to 60 characters each); local tags are stored in a `tags.json` sidecar next to `index.jsonl` in the run workspace, so they're mutable, non-destructive, and won't touch your eval YAML or run manifest. The chip editor supports Enter/comma to commit a new tag, Backspace to remove the last chip, and **Clear all** to remove every tag (deletes the sidecar). +Click any row's **Tags** cell to tag a run after the fact. Each run can carry multiple free-form tags (max 20, up to 60 characters each); local tags are stored in a `tags.json` sidecar next to `index.jsonl` in the run workspace, so they're mutable, non-destructive, and won't touch your eval YAML or run manifest. The chip editor supports Enter/comma to commit a new tag, Backspace to remove the last chip, and **Clear all** to record an empty tag state with an operation watermark. Remote run payloads stay immutable, but their tags are editable. Dashboard writes remote tag changes as metadata overlays under `.agentv/results/metadata/runs/.../tags.json` in the configured results repo clone. Until those overlays are synced, the run and project show a dirty state; **Sync Project** commits and pushes them when it is safe to do so. 
From d24c97011c7bf7df621d82146966b3e13d11ca6c Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 06:53:26 +0200 Subject: [PATCH 13/21] fix(dashboard): harden trace session read model --- .../__fixtures__/trace-session-read-model.ts | 6 + .../src/lib/trace-read-model.test.ts | 160 +++++++++++- apps/dashboard/src/lib/trace-read-model.ts | 242 ++++++++++++++++-- 3 files changed, 388 insertions(+), 20 deletions(-) diff --git a/apps/dashboard/src/lib/__fixtures__/trace-session-read-model.ts b/apps/dashboard/src/lib/__fixtures__/trace-session-read-model.ts index f6fb19680..631549774 100644 --- a/apps/dashboard/src/lib/__fixtures__/trace-session-read-model.ts +++ b/apps/dashboard/src/lib/__fixtures__/trace-session-read-model.ts @@ -26,6 +26,10 @@ export const traceSessionEnvelopeFixture = { 'agentv.test_id': 'nested-session', 'agentv.target': 'codex', 'custom.unknown_value': { nested_value: true }, + external_trace_url: + 'https://phoenix.example/projects/agentv-dogfood/traces/phoenix-trace-456?api_key=secret', + external_trace_token: 'secret-span-token', + access_token: 'secret-access-token', 'gen_ai.usage.input_tokens': 14, 'gen_ai.usage.output_tokens': 9, }, @@ -38,6 +42,8 @@ export const traceSessionEnvelopeFixture = { text: 'Reviewer note', passed: true, extra_context: { source: 'grader' }, + authorization: 'Bearer secret', + nested: { password: 'secret', safe_value: 'visible' }, }, }, { diff --git a/apps/dashboard/src/lib/trace-read-model.test.ts b/apps/dashboard/src/lib/trace-read-model.test.ts index 474573813..68127da36 100644 --- a/apps/dashboard/src/lib/trace-read-model.test.ts +++ b/apps/dashboard/src/lib/trace-read-model.test.ts @@ -4,7 +4,11 @@ import { traceSessionEnvelopeFixture, traceSessionMissingOptionalFixture, } from './__fixtures__/trace-session-read-model'; -import { buildTraceSpanTree, traceEnvelopeToTraceSessionResponse } from './trace-read-model'; +import { + type TraceSpanNode, + buildTraceSpanTree, + 
traceEnvelopeToTraceSessionResponse, +} from './trace-read-model'; function expectSnakeCaseFixtureKeys(value: unknown, path: string[] = []): void { if (Array.isArray(value)) { @@ -24,6 +28,10 @@ function expectSnakeCaseFixtureKeys(value: unknown, path: string[] = []): void { } } +function flattenTree(nodes: readonly TraceSpanNode[]): TraceSpanNode[] { + return nodes.flatMap((node) => [node, ...flattenTree(node.children)]); +} + describe('trace session read model', () => { it('projects snake_case trace artifacts into stable Dashboard span trees', () => { const session = traceEnvelopeToTraceSessionResponse(traceSessionEnvelopeFixture, { @@ -65,6 +73,10 @@ describe('trace session read model', () => { expect(root?.duration_ms).toBe(1500); expect(root?.token_usage).toEqual({ input: 14, output: 9 }); expect(root?.attributes?.['custom.unknown_value']).toEqual({ nested_value: true }); + expect(root?.attributes?.['gen_ai.usage.input_tokens']).toBe(14); + expect(root?.attributes).not.toHaveProperty('external_trace_url'); + expect(root?.attributes).not.toHaveProperty('external_trace_token'); + expect(root?.attributes).not.toHaveProperty('access_token'); expect(session.events.map((event) => [event.event_id, event.kind, event.name])).toEqual([ ['annotation-1', 'annotation', 'agentv.annotation'], @@ -73,7 +85,7 @@ describe('trace session read model', () => { expect(session.events[0]).toMatchObject({ text: 'Reviewer note', passed: true, - attributes: { extra_context: { source: 'grader' } }, + attributes: { extra_context: { source: 'grader' }, nested: { safe_value: 'visible' } }, }); expect(session.events[1]).toMatchObject({ score: 0.82, @@ -109,6 +121,8 @@ describe('trace session read model', () => { }); expect(JSON.stringify(session.external_trace)).not.toContain('secret'); expect(JSON.stringify(session.external_trace)).not.toContain('api_key'); + expect(JSON.stringify(session)).not.toContain('secret'); + expect(JSON.stringify(session)).not.toContain('api_key'); 
expect(session.source?.metadata).toEqual({ safe_note: 'local artifact remains canonical', }); @@ -132,6 +146,148 @@ describe('trace session read model', () => { expect(JSON.stringify(session.external_trace)).not.toContain('not-a-url'); }); + it('preserves duplicate span IDs with collision-free node IDs and diagnostics', () => { + const tree = buildTraceSpanTree([ + { + id: 'root', + span_id: 'root', + parent_span_id: null, + name: 'root', + start_time_unix_nano: '1000', + }, + { + id: 'dup', + span_id: 'dup', + parent_span_id: 'root', + name: 'first duplicate', + start_time_unix_nano: '1100', + }, + { + id: 'dup', + span_id: 'dup', + parent_span_id: 'root', + name: 'second duplicate', + start_time_unix_nano: '1200', + }, + ]); + const nodes = flattenTree(tree); + + expect(nodes.map((node) => node.id)).toEqual(['root', 'dup', 'dup#2']); + expect(nodes.map((node) => node.span.name)).toEqual([ + 'root', + 'first duplicate', + 'second duplicate', + ]); + expect(nodes[2].diagnostics?.map((diagnostic) => diagnostic.code)).toEqual([ + 'duplicate_span_id', + ]); + }); + + it('promotes self-parented spans and ancestor cycles to diagnostic roots', () => { + const tree = buildTraceSpanTree([ + { + id: 'self', + span_id: 'self', + parent_span_id: 'self', + name: 'self', + start_time_unix_nano: '3000', + }, + { + id: 'cycle-a', + span_id: 'cycle-a', + parent_span_id: 'cycle-b', + name: 'cycle-a', + start_time_unix_nano: '1000', + }, + { + id: 'cycle-b', + span_id: 'cycle-b', + parent_span_id: 'cycle-a', + name: 'cycle-b', + start_time_unix_nano: '2000', + }, + ]); + const nodes = flattenTree(tree); + + expect(tree.map((node) => node.spanId)).toEqual(['cycle-a', 'cycle-b', 'self']); + expect(nodes.every((node) => node.children.length === 0)).toBe(true); + expect(nodes.map((node) => node.diagnostics?.[0]?.code)).toEqual([ + 'cycle', + 'cycle', + 'self_parent', + ]); + }); + + it('keeps missing-ID and missing-parent spans as diagnostic roots', () => { + const tree = 
buildTraceSpanTree([ + { + id: '', + span_id: '', + parent_span_id: null, + name: 'missing id', + }, + { + id: 'orphan', + span_id: 'orphan', + parent_span_id: 'missing-parent', + name: 'orphan', + }, + ]); + + expect(tree.map((node) => node.id)).toEqual(['missing-span-0', 'orphan']); + expect(tree.map((node) => node.diagnostics?.[0]?.code)).toEqual([ + 'missing_span_id', + 'missing_parent', + ]); + }); + + it('sorts roots and children by start time with stable span ID tie breaks', () => { + const tree = buildTraceSpanTree([ + { + id: 'root-b', + span_id: 'root-b', + parent_span_id: null, + name: 'root-b', + start_time_unix_nano: '2000', + }, + { + id: 'child-late', + span_id: 'child-late', + parent_span_id: 'root-a', + name: 'child-late', + start_time_unix_nano: '1200', + }, + { + id: 'root-a', + span_id: 'root-a', + parent_span_id: null, + name: 'root-a', + start_time_unix_nano: '1000', + }, + { + id: 'child-early', + span_id: 'child-early', + parent_span_id: 'root-a', + name: 'child-early', + start_time_unix_nano: '1100', + }, + { + id: 'child-alpha', + span_id: 'child-alpha', + parent_span_id: 'root-a', + name: 'child-alpha', + start_time_unix_nano: '1200', + }, + ]); + + expect(tree.map((node) => node.spanId)).toEqual(['root-a', 'root-b']); + expect(tree[0].children.map((node) => node.spanId)).toEqual([ + 'child-early', + 'child-alpha', + 'child-late', + ]); + }); + it('keeps new API fixtures snake_case-only outside opaque attributes maps', () => { expectSnakeCaseFixtureKeys(traceSessionEnvelopeFixture); expectSnakeCaseFixtureKeys(traceSessionMissingOptionalFixture); diff --git a/apps/dashboard/src/lib/trace-read-model.ts b/apps/dashboard/src/lib/trace-read-model.ts index cc1b16d70..b14fa4c2b 100644 --- a/apps/dashboard/src/lib/trace-read-model.ts +++ b/apps/dashboard/src/lib/trace-read-model.ts @@ -22,6 +22,22 @@ export interface TraceSpanNode { parentSpanId?: string | null; span: TraceSessionSpan; children: TraceSpanNode[]; + diagnostics?: 
TraceSpanTreeDiagnostic[]; +} + +export type TraceSpanTreeDiagnosticCode = + | 'cycle' + | 'duplicate_span_id' + | 'missing_parent' + | 'missing_span_id' + | 'self_parent'; + +export interface TraceSpanTreeDiagnostic { + code: TraceSpanTreeDiagnosticCode; + message: string; + span_id?: string; + node_id?: string; + parent_span_id?: string; } function isRecord(value: unknown): value is Record { @@ -57,6 +73,10 @@ function compactRecord(value: Record): Record return Object.keys(compacted).length > 0 ? compacted : undefined; } +function nonEmptyArray(value: readonly T[] | undefined): readonly T[] | undefined { + return value && value.length > 0 ? value : undefined; +} + function unixNanoToIso(value: string | undefined): string | undefined { if (!value) { return undefined; @@ -138,6 +158,48 @@ function tokenUsageFromAttributes( return usage as TraceSessionTokenUsage | undefined; } +function isExternalTraceKey(key: string): boolean { + return ( + key === 'external_trace' || + key.startsWith('external_trace_') || + key.startsWith('external_trace.') + ); +} + +function isCredentialLikeKey(key: string): boolean { + const normalized = key.toLowerCase(); + if ( + normalized === 'token_usage' || + normalized.endsWith('_tokens') || + normalized.endsWith('.tokens') || + normalized.includes('usage.') + ) { + return false; + } + return /(^|[._-])(api[._-]?key|authorization|bearer|password|secret|private[._-]?key|access[._-]?token|auth[._-]?token|client[._-]?secret|id[._-]?token|refresh[._-]?token|session[._-]?token|token)($|[._-])/.test( + normalized, + ); +} + +function sanitizeAttributeMap( + value: Record | undefined, +): Record | undefined { + if (!value) { + return undefined; + } + const entries = Object.entries(value).flatMap(([key, entry]) => { + if (isExternalTraceKey(key) || isCredentialLikeKey(key)) { + return []; + } + if (isRecord(entry)) { + const nested = sanitizeAttributeMap(entry); + return nested ? 
[[key, nested] as const] : []; + } + return [[key, entry] as const]; + }); + return entries.length > 0 ? Object.fromEntries(entries) : undefined; +} + function spanStatusFromValue(value: unknown): TraceSessionSpan['status'] { const record = asRecord(value); if (!record) { @@ -235,6 +297,7 @@ function projectSpanEvent( } const attributes = asRecord(record.attributes); + const safeAttributes = sanitizeAttributeMap(attributes); return dropUndefined({ event_id: eventId(spanId, index, attributes), span_id: spanId, @@ -245,7 +308,7 @@ function projectSpanEvent( score: scoreFromEvent(attributes), text: textFromEvent(attributes), passed: passedFromEvent(attributes), - attributes, + attributes: safeAttributes, }); } @@ -259,6 +322,7 @@ function projectSpan(span: unknown, index: number): TraceSessionSpan | undefined const traceId = stringValue(record.trace_id); const parentSpanId = record.parent_span_id === null ? null : stringValue(record.parent_span_id); const attributes = asRecord(record.attributes); + const safeAttributes = sanitizeAttributeMap(attributes); const startTimeUnixNano = stringValue(record.start_time_unix_nano); const endTimeUnixNano = stringValue(record.end_time_unix_nano); const events = asArray(record.events) @@ -279,7 +343,7 @@ function projectSpan(span: unknown, index: number): TraceSessionSpan | undefined end_time: unixNanoToIso(endTimeUnixNano), duration_ms: durationMsFromNanos(startTimeUnixNano, endTimeUnixNano), token_usage: tokenUsageFromAttributes(attributes), - attributes, + attributes: safeAttributes, events: events.length > 0 ? 
events : undefined, }); } @@ -377,12 +441,7 @@ function sanitizeMetadata( return undefined; } const entries = Object.entries(value).flatMap(([key, entry]) => { - if ( - key === 'external_trace' || - key.startsWith('external_trace_') || - key.startsWith('external_trace.') || - isSecretLikeKey(key) - ) { + if (isExternalTraceKey(key) || isSecretLikeKey(key)) { return []; } if (isRecord(entry)) { @@ -463,22 +522,100 @@ export function traceEnvelopeToTraceSessionResponse( } export function buildTraceSpanTree(spans: readonly TraceSessionSpan[]): TraceSpanNode[] { - const nodes = new Map(); - const roots: TraceSpanNode[] = []; - - for (const span of spans) { - nodes.set(span.span_id, { - id: span.id, - spanId: span.span_id, + const nodes: TraceSpanNode[] = []; + const firstNodeBySpanId = new Map(); + const spanIdCounts = new Map(); + + spans.forEach((span, index) => { + const rawSpanId = stringValue(span.span_id); + const spanId = rawSpanId ?? `missing-span-${index}`; + const occurrence = (spanIdCounts.get(spanId) ?? 0) + 1; + spanIdCounts.set(spanId, occurrence); + + const node: TraceSpanNode = { + id: occurrence === 1 ? spanId : `${spanId}#${occurrence}`, + spanId, parentSpanId: span.parent_span_id, span, children: [], + diagnostics: rawSpanId + ? undefined + : [ + { + code: 'missing_span_id', + message: 'Span was missing span_id and was assigned a stable node id.', + node_id: spanId, + }, + ], + }; + + if (occurrence > 1) { + addNodeDiagnostic(node, { + code: 'duplicate_span_id', + message: 'Duplicate span_id was preserved with a collision-free node id.', + span_id: spanId, + node_id: node.id, + }); + } + if (!firstNodeBySpanId.has(spanId)) { + firstNodeBySpanId.set(spanId, node); + } + nodes.push(node); + }); + + const parentByNodeId = new Map(); + for (const node of nodes) { + const parentSpanId = + typeof node.parentSpanId === 'string' && node.parentSpanId.length > 0 + ? 
node.parentSpanId + : undefined; + if (!parentSpanId) { + continue; + } + if (parentSpanId === node.spanId) { + addNodeDiagnostic(node, { + code: 'self_parent', + message: 'Span parent_span_id points to itself; span was promoted to a root.', + span_id: node.spanId, + node_id: node.id, + parent_span_id: parentSpanId, + }); + continue; + } + const parent = firstNodeBySpanId.get(parentSpanId); + if (!parent) { + addNodeDiagnostic(node, { + code: 'missing_parent', + message: 'Span parent_span_id was not present in this trace; span was promoted to a root.', + span_id: node.spanId, + node_id: node.id, + parent_span_id: parentSpanId, + }); + continue; + } + parentByNodeId.set(node.id, parent); + } + + const cyclicNodes: TraceSpanNode[] = []; + for (const node of nodes) { + if (hasAncestorCycle(node, parentByNodeId)) { + cyclicNodes.push(node); + } + } + for (const node of cyclicNodes) { + parentByNodeId.delete(node.id); + addNodeDiagnostic(node, { + code: 'cycle', + message: 'Span parent chain contains a cycle; span was promoted to a root.', + span_id: node.spanId, + node_id: node.id, + parent_span_id: typeof node.parentSpanId === 'string' ? node.parentSpanId : undefined, }); } - for (const node of nodes.values()) { - const parentId = typeof node.parentSpanId === 'string' ? node.parentSpanId : undefined; - const parent = parentId ? nodes.get(parentId) : undefined; + const roots: TraceSpanNode[] = []; + for (const node of nodes) { + const parent = parentByNodeId.get(node.id); if (parent) { parent.children.push(node); } else { @@ -486,5 +623,74 @@ export function buildTraceSpanTree(spans: readonly TraceSessionSpan[]): TraceSpa } } + sortTraceSpanNodes(roots); return roots; } + +function addNodeDiagnostic(node: TraceSpanNode, diagnostic: TraceSpanTreeDiagnostic): void { + node.diagnostics = [...(node.diagnostics ?? 
[]), diagnostic]; +} + +function hasAncestorCycle( + node: TraceSpanNode, + parentByNodeId: ReadonlyMap, +): boolean { + const seen = new Set(); + let cursor = parentByNodeId.get(node.id); + while (cursor) { + if (cursor.id === node.id || seen.has(cursor.id)) { + return true; + } + seen.add(cursor.id); + cursor = parentByNodeId.get(cursor.id); + } + return false; +} + +function compareUnixNanoValue(first: string | undefined, second: string | undefined): number { + if (first === second) { + return 0; + } + if (!first) { + return 1; + } + if (!second) { + return -1; + } + try { + const firstValue = BigInt(first); + const secondValue = BigInt(second); + return firstValue < secondValue ? -1 : firstValue > secondValue ? 1 : 0; + } catch { + return first.localeCompare(second); + } +} + +function compareTraceSpanNodes(first: TraceSpanNode, second: TraceSpanNode): number { + const byStart = compareUnixNanoValue( + first.span.start_time_unix_nano, + second.span.start_time_unix_nano, + ); + if (byStart !== 0) { + return byStart; + } + if (first.spanId === second.parentSpanId) { + return -1; + } + if (second.spanId === first.parentSpanId) { + return 1; + } + const bySpanId = first.spanId.localeCompare(second.spanId); + return bySpanId !== 0 ? 
bySpanId : first.id.localeCompare(second.id); +} + +function sortTraceSpanNodes(nodes: TraceSpanNode[]): void { + nodes.sort(compareTraceSpanNodes); + for (const node of nodes) { + node.children.sort(compareTraceSpanNodes); + if (node.children.length > 0) { + sortTraceSpanNodes(node.children); + } + node.diagnostics = nonEmptyArray(node.diagnostics) as TraceSpanTreeDiagnostic[] | undefined; + } +} From cbb5979a0f2471f136d2035e705308aff5920a95 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 06:53:54 +0200 Subject: [PATCH 14/21] fix(results): preserve artifact pointers when combining runs --- apps/cli/src/commands/results/combine-run.ts | 89 +++++++++++++++++- .../commands/eval/artifact-writer.test.ts | 23 +++++ .../cli/test/commands/results/combine.test.ts | 93 +++++++++++++++++++ .../core/src/evaluation/result-row-schema.ts | 26 +++++- 4 files changed, 228 insertions(+), 3 deletions(-) diff --git a/apps/cli/src/commands/results/combine-run.ts b/apps/cli/src/commands/results/combine-run.ts index 2e97ff711..48ce91ea8 100644 --- a/apps/cli/src/commands/results/combine-run.ts +++ b/apps/cli/src/commands/results/combine-run.ts @@ -20,7 +20,12 @@ import { } from 'node:fs'; import path from 'node:path'; -import type { EvaluationResult } from '@agentv/core'; +import type { + EvaluationResult, + ResultArtifactPointerWire, + ResultArtifactPointersWire, + TranscriptArtifactPointerWire, +} from '@agentv/core'; import { type BenchmarkArtifact, @@ -317,6 +322,15 @@ const MANIFEST_PATH_FIELDS = [ 'graders_path', ] as const; +const POINTER_FAMILIES = { + trace: 'traces', + transcript: 'transcripts', +} as const; + +function isSafeRelativeArtifactPath(relativePath: string): boolean { + return !path.isAbsolute(relativePath) && !relativePath.split(/[\\/]+/).includes('..'); +} + function copyReferencedArtifact( sourceBaseDir: string, outputDir: string, @@ -324,7 +338,7 @@ function copyReferencedArtifact( relativePath: string | undefined, ): string | undefined { 
if (!relativePath) return undefined; - if (path.isAbsolute(relativePath) || relativePath.split(/[\\/]+/).includes('..')) { + if (!isSafeRelativeArtifactPath(relativePath)) { throw new Error(`Unsafe artifact path in source manifest: ${relativePath}`); } const sourcePath = path.join(sourceBaseDir, relativePath); @@ -343,6 +357,71 @@ function copyReferencedArtifact( return rewritten; } +function rewriteArtifactPointer( + pointerName: keyof typeof POINTER_FAMILIES, + pointer: ResultArtifactPointerWire | undefined, + sourceBaseDir: string, + outputDir: string, + sourceIndex: number, +): ResultArtifactPointerWire | undefined { + if (!pointer) { + return undefined; + } + + if (!isSafeRelativeArtifactPath(pointer.path)) { + throw new Error(`Unsafe artifact path in source manifest: ${pointer.path}`); + } + const sourcePath = path.join(sourceBaseDir, pointer.path); + if (!existsSync(sourcePath)) { + return { ...pointer }; + } + + const rewrittenPath = copyReferencedArtifact(sourceBaseDir, outputDir, sourceIndex, pointer.path); + if (!rewrittenPath) { + return { ...pointer }; + } + + const family = pointer.family ?? 
POINTER_FAMILIES[pointerName]; + return { + ...pointer, + path: rewrittenPath, + key: path.posix.join(family, rewrittenPath), + }; +} + +function rewriteTranscriptArtifactPointer( + pointer: TranscriptArtifactPointerWire | undefined, + sourceBaseDir: string, + outputDir: string, + sourceIndex: number, +): TranscriptArtifactPointerWire | undefined { + return rewriteArtifactPointer('transcript', pointer, sourceBaseDir, outputDir, sourceIndex) as + | TranscriptArtifactPointerWire + | undefined; +} + +function rewriteArtifactPointers( + pointers: ResultArtifactPointersWire | undefined, + sourceBaseDir: string, + outputDir: string, + sourceIndex: number, +): ResultArtifactPointersWire | undefined { + if (!pointers) { + return undefined; + } + + return { + ...pointers, + trace: rewriteArtifactPointer('trace', pointers.trace, sourceBaseDir, outputDir, sourceIndex), + transcript: rewriteTranscriptArtifactPointer( + pointers.transcript, + sourceBaseDir, + outputDir, + sourceIndex, + ), + }; +} + function rewriteAndCopyRecord(row: SelectedRow, outputDir: string): ResultManifestRecord { const sourceBaseDir = path.dirname(row.source.manifestPath); const rewritten: Record = { ...row.record }; @@ -354,6 +433,12 @@ function rewriteAndCopyRecord(row: SelectedRow, outputDir: string): ResultManife row.record[field], ); } + rewritten.artifact_pointers = rewriteArtifactPointers( + row.record.artifact_pointers, + sourceBaseDir, + outputDir, + row.source.index, + ); return rewritten as unknown as ResultManifestRecord; } diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 07867988f..56c053262 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -599,6 +599,29 @@ describe('parseJsonlResults', () => { expect(results[0].trace.toolCalls).toEqual({ rg: 1 }); }); + it('rejects camelCase artifact pointer rows for the new wire field', () => { + const 
content = `${JSON.stringify({ + test_id: 'pointer-row', + target: 'codex', + score: 1, + artifactPointers: { + transcript: { + ref: 'agentv/artifacts/v1', + key: 'transcripts/pointer-row/outputs/transcript.jsonl', + object_version: 'sha256:test', + path: 'pointer-row/outputs/transcript.jsonl', + sha256: 'test', + size: 1, + schema_version: 'agentv.transcript.v1', + media_type: 'application/x-ndjson', + family: 'transcripts', + }, + }, + })}\n`; + + expect(() => parseJsonlResults(content)).toThrow(/Use "artifact_pointers"/); + }); + it('handles empty content', () => { expect(parseJsonlResults('')).toHaveLength(0); }); diff --git a/apps/cli/test/commands/results/combine.test.ts b/apps/cli/test/commands/results/combine.test.ts index 9d5dfc5d3..b35b15424 100644 --- a/apps/cli/test/commands/results/combine.test.ts +++ b/apps/cli/test/commands/results/combine.test.ts @@ -14,6 +14,13 @@ function toJsonl(...records: object[]): string { return `${records.map((record) => JSON.stringify(record)).join('\n')}\n`; } +function readIndex(filePath: string): Record[] { + return readFileSync(filePath, 'utf8') + .trim() + .split('\n') + .map((line) => JSON.parse(line) as Record); +} + const result = (overrides: Record = {}) => ({ timestamp: '2026-06-01T10:00:00.000Z', test_id: 'test-a', @@ -87,6 +94,92 @@ describe('results combine', () => { expect(benchmark.metadata.timestamp).toBe('2026-06-01T10:00:00.000Z'); }); + it('copies and rewrites artifact pointers when combining runs', () => { + const first = seedRun('run-a', [ + result({ + artifact_pointers: { + trace: { + ref: 'agentv/artifacts/v1', + key: 'traces/demo/test-a/outputs/trace.json', + object_version: 'sha256:trace', + path: 'demo/test-a/outputs/trace.json', + sha256: 'trace', + size: 18, + schema_version: 'agentv.trace.v1', + media_type: 'application/vnd.agentv.trace.v1+json', + family: 'traces', + }, + transcript: { + ref: 'agentv/artifacts/v1', + key: 'transcripts/demo/test-a/outputs/transcript.jsonl', + object_version: 
'sha256:transcript', + path: 'demo/test-a/outputs/transcript.jsonl', + sha256: 'transcript', + size: 180, + schema_version: 'agentv.transcript.v1', + media_type: 'application/x-ndjson', + family: 'transcripts', + }, + }, + }), + ]); + mkdirSync(path.join(first, 'demo', 'test-a', 'outputs'), { recursive: true }); + writeFileSync(path.join(first, 'demo', 'test-a', 'outputs', 'trace.json'), '{"trace":[]}\n'); + writeFileSync( + path.join(first, 'demo', 'test-a', 'outputs', 'transcript.jsonl'), + `${JSON.stringify({ + schema_version: 'agentv.transcript.v1', + test_id: 'test-a', + target: 'mock', + message_index: 0, + role: 'assistant', + content: 'Pointer-backed transcript', + source: { provider: 'mock', session_id: 'session-a' }, + })}\n`, + ); + const second = seedRun('run-b', [ + result({ + timestamp: '2026-06-01T11:00:00.000Z', + test_id: 'test-b', + grading_path: 'demo/test-b/grading.json', + timing_path: 'demo/test-b/timing.json', + }), + ]); + mkdirSync(path.join(second, 'demo', 'test-b'), { recursive: true }); + writeFileSync(path.join(second, 'demo', 'test-b', 'grading.json'), '{"assertions":[]}\n'); + writeFileSync( + path.join(second, 'demo', 'test-b', 'timing.json'), + '{"duration_ms":0,"total_duration_seconds":0,"total_tokens":0,"token_usage":{}}\n', + ); + + const combined = combineRunSources({ + cwd: tempDir, + sources: buildCombineRunSources([first, second], tempDir), + duplicatePolicy: 'error', + }); + + const [record] = readIndex(combined.manifestPath); + expect(record).not.toHaveProperty('transcript_path'); + expect(record.artifact_pointers).toMatchObject({ + trace: { + key: 'traces/sources/source-1/demo/test-a/outputs/trace.json', + path: 'sources/source-1/demo/test-a/outputs/trace.json', + }, + transcript: { + key: 'transcripts/sources/source-1/demo/test-a/outputs/transcript.jsonl', + path: 'sources/source-1/demo/test-a/outputs/transcript.jsonl', + }, + }); + expect( + existsSync(path.join(combined.runDir, 
'sources/source-1/demo/test-a/outputs/trace.json')), + ).toBe(true); + expect( + existsSync( + path.join(combined.runDir, 'sources/source-1/demo/test-a/outputs/transcript.jsonl'), + ), + ).toBe(true); + }); + it('errors on duplicate rows unless latest is explicit', () => { const first = seedRun('run-a', [result({ timestamp: '2026-06-01T10:00:00.000Z', score: 0.1 })]); const second = seedRun('run-b', [ diff --git a/packages/core/src/evaluation/result-row-schema.ts b/packages/core/src/evaluation/result-row-schema.ts index 5ec17dec1..fe457b84c 100644 --- a/packages/core/src/evaluation/result-row-schema.ts +++ b/packages/core/src/evaluation/result-row-schema.ts @@ -20,7 +20,6 @@ const MIGRATION_GUIDANCE = const RESULT_ROW_ALIASES = { answerPath: 'answer_path', - artifactPointers: 'artifact_pointers', artifactDir: 'artifact_dir', conversationId: 'conversation_id', costUsd: 'cost_usd', @@ -46,6 +45,10 @@ const RESULT_ROW_ALIASES = { workspacePath: 'workspace_path', } as const; +const NEW_SNAKE_CASE_ONLY_FIELDS = { + artifactPointers: 'artifact_pointers', +} as const; + const TRACE_SUMMARY_ALIASES = { costUsd: 'cost_usd', durationMs: 'duration_ms', @@ -150,6 +153,19 @@ function buildInvalidScoreError(context: { return new ResultRowSchemaError(`Missing or invalid score in result row${location}.`); } +function buildSnakeCaseOnlyFieldError( + field: keyof typeof NEW_SNAKE_CASE_ONLY_FIELDS, + context: { lineNumber?: number; sourceLabel?: string }, +): ResultRowSchemaError { + const location = [ + context.sourceLabel ? ` in ${context.sourceLabel}` : '', + context.lineNumber !== undefined ? ` at line ${context.lineNumber}` : '', + ].join(''); + return new ResultRowSchemaError( + `Unsupported camelCase result row field "${field}"${location}. 
Use "${NEW_SNAKE_CASE_ONLY_FIELDS[field]}".`, + ); +} + function looksLikeResultRow(value: Record): boolean { return ( typeof value.test_id === 'string' || @@ -170,6 +186,14 @@ export function normalizeResultRow( throw buildSchemaError(context); } + for (const field of Object.keys( + NEW_SNAKE_CASE_ONLY_FIELDS, + ) as (keyof typeof NEW_SNAKE_CASE_ONLY_FIELDS)[]) { + if (Object.hasOwn(value, field)) { + throw buildSnakeCaseOnlyFieldError(field, context); + } + } + const normalized = normalizeKnownAliases(value, RESULT_ROW_ALIASES); if (normalized.trace !== undefined) { normalized.trace = normalizeTraceSummary(normalized.trace); From bc9c3931c4b95fbd66ece8de3c6e1d9a36b609fc Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 06:53:59 +0200 Subject: [PATCH 15/21] fix(results): align projection bundle refs with exports --- apps/cli/src/commands/results/export.ts | 46 +++++++++++++- .../src/commands/results/projection-bundle.ts | 62 +++++++++++++++---- apps/cli/test/commands/results/export.test.ts | 53 ++++++++++++++++ .../src/content/docs/docs/tools/results.mdx | 11 +++- 4 files changed, 154 insertions(+), 18 deletions(-) diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts index 21120c27e..69e3f22d7 100644 --- a/apps/cli/src/commands/results/export.ts +++ b/apps/cli/src/commands/results/export.ts @@ -21,15 +21,18 @@ * - To add new per-test workspace files, add them under each test directory. 
*/ +import { readFileSync } from 'node:fs'; import path from 'node:path'; import { command, flag, oneOf, option, optional, positional, string } from 'cmd-ts'; -import type { EvaluationResult, ExportDuplicatePolicy } from '@agentv/core'; +import type { EvaluationResult, ExportDuplicatePolicy, IndexArtifactEntry } from '@agentv/core'; import { parseJsonlResults, writeArtifactsFromResults } from '../eval/artifact-writer.js'; import { RESULT_INDEX_FILENAME } from '../eval/result-layout.js'; +import { loadManifestResults } from './manifest.js'; import { + type ProjectionBundle, buildProjectionBundle, serializeProjectionBundle, writeProjectionBundle, @@ -97,6 +100,36 @@ export async function loadExportSource( return { sourceFile, results }; } +function readIndexArtifactEntries(indexPath: string): IndexArtifactEntry[] { + return readFileSync(indexPath, 'utf8') + .split(/\r?\n/) + .map((line) => line.trim()) + .filter(Boolean) + .map((line) => JSON.parse(line) as IndexArtifactEntry); +} + +export function buildProjectionBundleFromExportedIndex(options: { + readonly sourceFile: string; + readonly outputDir: string; + readonly cwd?: string; + readonly includeRawContent?: boolean; + readonly duplicatePolicy?: ExportDuplicatePolicy; +}): ProjectionBundle { + const indexPath = path.join(options.outputDir, RESULT_INDEX_FILENAME); + const indexRecords = readIndexArtifactEntries(indexPath); + const emittedResults = loadManifestResults(indexPath); + + return buildProjectionBundle(emittedResults, { + sourceFile: options.sourceFile, + runId: deriveExportRunId(options.sourceFile), + cwd: options.cwd, + duplicatePolicy: options.duplicatePolicy, + includeRawContent: options.includeRawContent, + artifactRefStatus: 'emitted', + indexRecords, + }); +} + // ── CLI command ────────────────────────────────────────────────────────── export const resultsExportCommand = command({ @@ -186,7 +219,16 @@ export const resultsExportCommand = command({ }); const bundlePath = shouldWriteProjectionBundle 
- ? await writeProjectionBundle(buildBundle(), outputDir) + ? await writeProjectionBundle( + buildProjectionBundleFromExportedIndex({ + sourceFile, + outputDir, + cwd, + duplicatePolicy: policy, + includeRawContent: shouldIncludeRawContent, + }), + outputDir, + ) : undefined; // Report exported test IDs diff --git a/apps/cli/src/commands/results/projection-bundle.ts b/apps/cli/src/commands/results/projection-bundle.ts index b955af27c..d01cb12c8 100644 --- a/apps/cli/src/commands/results/projection-bundle.ts +++ b/apps/cli/src/commands/results/projection-bundle.ts @@ -100,7 +100,9 @@ export type ProjectionBundleArtifactRefs = Partial< | 'files_path' | 'graders_path' > & { readonly trace_path: string } ->; +> & { + readonly status: 'planned_export' | 'emitted'; +}; export interface BuildProjectionBundleOptions { readonly sourceFile: string; @@ -108,6 +110,8 @@ export interface BuildProjectionBundleOptions { readonly cwd?: string; readonly includeRawContent?: boolean; readonly duplicatePolicy?: ExportDuplicatePolicy; + readonly artifactRefStatus?: ProjectionBundleArtifactRefs['status']; + readonly indexRecords?: readonly IndexArtifactEntry[]; } function dropUndefined(value: T): T { @@ -147,11 +151,26 @@ function tracePathFor(indexEntry: IndexArtifactEntry): string | undefined { : undefined; } -function artifactRefs(indexEntry: IndexArtifactEntry): ProjectionBundleArtifactRefs { +function artifactRefs( + indexEntry: IndexArtifactEntry, + options: { + readonly includeRawContent: boolean; + readonly status: ProjectionBundleArtifactRefs['status']; + }, +): ProjectionBundleArtifactRefs { + const metadataRefs = dropUndefined({ + status: options.status, + timing_path: indexEntry.timing_path, + }); + + if (!options.includeRawContent) { + return metadataRefs; + } + return dropUndefined({ + ...metadataRefs, artifact_dir: indexEntry.artifact_dir, grading_path: indexEntry.grading_path, - timing_path: indexEntry.timing_path, input_path: indexEntry.input_path, output_path: 
indexEntry.output_path, answer_path: indexEntry.answer_path, @@ -210,6 +229,7 @@ function safeEnvelope( ...envelope.source, metadata: undefined, }, + artifacts: undefined, scores: envelope.scores?.map(({ evidence: _evidence, ...score }) => score), }); @@ -245,15 +265,17 @@ function rawContent(result: EvaluationResult): ProjectionBundleEntry['raw_conten function buildEntry( result: EvaluationResult, options: BuildProjectionBundleOptions, + indexRecord?: IndexArtifactEntry, ): ProjectionBundleEntry { const includeRawContent = options.includeRawContent ?? false; const sourcePath = toPortablePath(options.sourceFile, options.cwd); + const plannedIndexEntry = buildResultIndexArtifact(result); const envelope = buildTraceEnvelopeFromEvaluationResult(result, { evalPath: sourcePath, runId: options.runId, source: { kind: 'agentv_run', path: sourcePath, format: 'agentv_result' }, artifacts: { - trace_path: tracePathFor(buildResultIndexArtifact(result)), + trace_path: tracePathFor(indexRecord ?? plannedIndexEntry), answer_path: result.output.length > 0 ? 'outputs/answer.md' : undefined, response_path: result.output.length > 0 ? 'outputs/response.md' : undefined, }, @@ -266,16 +288,26 @@ function buildEntry( throw new Error(`Result ${result.testId ?? 'unknown'} is missing projection identity`); } - const indexEntry = buildResultIndexArtifact(result, undefined, { - projectionIdentity, - duplicatePolicy: options.duplicatePolicy, + const indexEntry = + indexRecord ?? + buildResultIndexArtifact(result, undefined, { + projectionIdentity, + duplicatePolicy: options.duplicatePolicy, + }); + const refs = artifactRefs(indexEntry, { + includeRawContent, + status: options.artifactRefStatus ?? 
'planned_export', }); - const refs = artifactRefs(indexEntry); - const envelopeWire = safeEnvelope(toTraceEnvelopeWire(envelope), { includeRawContent }); - const projectionIdentityWire = envelopeWire.projection_identity; + const safeEnvelopeWire = safeEnvelope(toTraceEnvelopeWire(envelope), { includeRawContent }); + const projectionIdentityWire = + indexEntry.projection_identity ?? safeEnvelopeWire.projection_identity; if (!projectionIdentityWire) { throw new Error(`Result ${result.testId ?? 'unknown'} is missing projection identity`); } + const envelopeWire = { + ...safeEnvelopeWire, + projection_identity: projectionIdentityWire, + }; const scores = safeScores(envelopeWire.scores, { includeRawContent }); const feedback: ProjectionBundleEntry['feedback'] = dropUndefined({ @@ -293,13 +325,13 @@ function buildEntry( projection_identity: projectionIdentityWire, eval: envelopeWire.eval, artifact_refs: refs, - trace: { + trace: dropUndefined({ format: envelopeWire.trace.format, trace_id: envelopeWire.trace.trace_id, root_span_id: envelopeWire.trace.root_span_id, span_count: envelopeWire.trace.spans.length, envelope_ref: refs.trace_path, - }, + }), trace_envelope: envelopeWire, feedback, capture: envelopeWire.capture, @@ -318,13 +350,17 @@ export function buildProjectionBundle( throw new Error(`No results found in ${options.sourceFile}`); } - const entries = results.map((result) => buildEntry(result, options)); + const entries = results.map((result, index) => + buildEntry(result, options, options.indexRecords?.[index]), + ); const includeRawContent = options.includeRawContent ?? false; + const artifactRefStatus = options.artifactRefStatus ?? 'planned_export'; const conversionWarnings = entries.flatMap((entry) => entry.conversion_warnings ?? []); const bundleId = `projection-bundle-${shortHash([ PROJECTION_BUNDLE_SCHEMA_VERSION, toPortablePath(options.sourceFile, options.cwd), options.runId, + artifactRefStatus, includeRawContent ? 
'raw' : 'metadata', ...entries.map((entry) => entry.projection_id), ])}`; diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index ec4cc6940..e14fe7aae 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -11,6 +11,7 @@ import type { } from '../../../src/commands/eval/artifact-writer.js'; import { parseJsonlResults } from '../../../src/commands/eval/artifact-writer.js'; import { + buildProjectionBundleFromExportedIndex, deriveExportRunId, deriveOutputDir, exportResults, @@ -270,6 +271,19 @@ describe('results export', () => { raw_content_opt_in: false, default_capture: 'metadata', }); + expect(first.entries[0].artifact_refs).toMatchObject({ + status: 'planned_export', + timing_path: 'privacy/test-private/timing.json', + }); + expect(first.entries[0].artifact_refs).not.toHaveProperty('input_path'); + expect(first.entries[0].artifact_refs).not.toHaveProperty('output_path'); + expect(first.entries[0].artifact_refs).not.toHaveProperty('answer_path'); + expect(first.entries[0].artifact_refs).not.toHaveProperty('response_path'); + expect(first.entries[0].artifact_refs).not.toHaveProperty('transcript_path'); + expect(first.entries[0].artifact_refs).not.toHaveProperty('trace_path'); + expect(first.entries[0].feedback).not.toHaveProperty('grading_path'); + expect(first.entries[0].trace).not.toHaveProperty('envelope_ref'); + expect(first.entries[0].trace_envelope).not.toHaveProperty('artifacts'); expect(first.entries[0].projection_identity.dimensions.run_id).toBe('privacy-run'); expect(first.entries[0].trace_envelope.trace.spans.length).toBeGreaterThan(0); expect(first.entries[0].feedback.scores?.[0]).not.toHaveProperty('evidence'); @@ -301,6 +315,17 @@ describe('results export', () => { content: 'full', redaction_level: 'none', }); + expect(bundle.entries[0].artifact_refs).toMatchObject({ + status: 'planned_export', + input_path: 
'privacy/test-private/input.md', + output_path: 'privacy/test-private/outputs/answer.md', + answer_path: 'privacy/test-private/outputs/answer.md', + response_path: 'privacy/test-private/outputs/response.md', + trace_path: 'privacy/test-private/outputs/trace.json', + }); + expect(bundle.entries[0].trace.envelope_ref).toBe('privacy/test-private/outputs/trace.json'); + expect(bundle.entries[0].trace_envelope.artifacts).toBeDefined(); + expect(bundle.entries[0].feedback.grading_path).toBe('privacy/test-private/grading.json'); expect(bundle.entries[0].raw_content).toBeDefined(); expect(bundle.entries[0].feedback.scores?.[0]).toHaveProperty('evidence'); expect(serialized).toContain('SECRET_PROMPT_TEXT'); @@ -445,6 +470,34 @@ describe('results export', () => { expect(readAnswer(outputDir, RESULT_FULL)).toBe('Hello, Alice!'); }); + it('builds projection bundles from emitted skipped artifacts for duplicate policy skip', async () => { + const sourceFile = path.join(tempDir, 'runs', 'retry-run', 'index.jsonl'); + const outputDir = path.join(tempDir, 'output'); + const updated = { ...RESULT_FULL, output: 'Skipped answer.' 
}; + + await exportResults(sourceFile, toJsonl(RESULT_FULL), outputDir, { + duplicatePolicy: 'update', + }); + await exportResults(sourceFile, toJsonl(updated), outputDir, { + duplicatePolicy: 'skip', + }); + + const bundle = buildProjectionBundleFromExportedIndex({ + sourceFile, + outputDir, + cwd: tempDir, + includeRawContent: true, + duplicatePolicy: 'skip', + }); + + expect(bundle.entries[0].artifact_refs.status).toBe('emitted'); + expect(bundle.entries[0].raw_content?.output).toBe('Hello, Alice!'); + expect(serializeProjectionBundle(bundle)).not.toContain('Skipped answer.'); + expect(bundle.entries[0].trace_envelope.projection_identity).toEqual( + readIndex(outputDir)[0].projection_identity, + ); + }); + it('fails duplicate projection artifacts when duplicate policy is error', async () => { const sourceFile = path.join(tempDir, 'runs', 'retry-run', 'index.jsonl'); const outputDir = path.join(tempDir, 'output'); diff --git a/apps/web/src/content/docs/docs/tools/results.mdx b/apps/web/src/content/docs/docs/tools/results.mdx index 8c888e4c7..75bcce7c3 100644 --- a/apps/web/src/content/docs/docs/tools/results.mdx +++ b/apps/web/src/content/docs/docs/tools/results.mdx @@ -131,11 +131,16 @@ agentv results export --dry-run > projection_bund Dry-run prints deterministic JSON and does not write export artifacts. Vendor adapters should consume either this JSON directly or the local -`projection_bundle.json` plus the referenced files such as `grading.json`, -`timing.json`, and `outputs/trace.json`. +`projection_bundle.json`. Dry-run refs are marked +`artifact_refs.status: "planned_export"` because the export tree has not been +written. Bundles written with `--projection-bundle` are built from the emitted +export `index.jsonl` and use `artifact_refs.status: "emitted"`. Raw prompt text, final output, and tool arguments/results are excluded by -default. 
To include them in the bundle, opt in explicitly: +default, and raw-bearing artifact refs such as `grading_path`, `input_path`, +`answer_path`, `response_path`, `transcript_path`, and `trace_path` are omitted from +metadata-only bundles. To include raw payloads and raw-bearing refs in the +bundle, opt in explicitly: ```bash agentv results export --dry-run --include-raw-content From ce7cacfc6921018177704a4f4ed366636912683d Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 07:12:04 +0200 Subject: [PATCH 16/21] fix(results): finalize reviewed stack integration --- apps/cli/src/commands/results/manifest.ts | 25 ++++++++++++++++++- apps/cli/src/commands/results/serve.ts | 2 +- apps/cli/test/commands/results/serve.test.ts | 6 ++--- packages/core/src/evaluation/run-artifacts.ts | 16 +++++------- 4 files changed, 34 insertions(+), 15 deletions(-) diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index d6ac1650a..2fbe3fbd5 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -148,8 +148,31 @@ function readOptionalJson(baseDir: string, relativePath: string | undefined): } } +function nonEmptyString(value: unknown): string | undefined { + return typeof value === 'string' && value.trim().length > 0 ? value : undefined; +} + +function artifactPointerPath(pointer: ArtifactPointer | undefined): string | undefined { + if (typeof pointer === 'string') { + return nonEmptyString(pointer); + } + if (!pointer) { + return undefined; + } + return ( + nonEmptyString(pointer.path) ?? + nonEmptyString(pointer.artifact_path) ?? + nonEmptyString(pointer.relative_path) + ); +} + function resolveTranscriptPath(record: ResultManifestRecord): string | undefined { - return record.transcript_path ?? record.artifact_pointers?.transcript?.path; + return ( + record.transcript_path ?? + record.artifact_pointers?.transcript?.path ?? + record.artifacts?.transcript_path ?? 
+ artifactPointerPath(record.transcript ?? record.artifacts?.transcript) + ); } function hydrateInput( diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index bcef191ad..970e564ca 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -355,7 +355,7 @@ function resolveRecordArtifactPointer( const pointer = kind === 'transcript' - ? (record.transcript ?? record.artifacts?.transcript) + ? (record.transcript ?? record.artifacts?.transcript ?? record.artifact_pointers?.transcript) : record.artifacts?.answer; const pointerPath = artifactPointerPath(pointer); const description = artifactPointerDescription(pointer); diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 23a886a23..7e6a357a3 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -2771,9 +2771,9 @@ describe('serve app', () => { toJsonl({ ...RESULT_A, experiment: 'pointer-transcript', - artifacts: { + artifact_pointers: { transcript: { - ref: 'agentv/results/v1/artifacts', + ref: 'agentv/artifacts/v1', path: artifactPath, }, }, @@ -2795,7 +2795,7 @@ describe('serve app', () => { expect(data.status).toBe('ok'); expect(data.transcript_path).toBe(artifactPath); expect(data.content).toBe(transcriptJsonl); - expect(data.pointer).toContain('agentv/results/v1/artifacts'); + expect(data.pointer).toContain('agentv/artifacts/v1'); }); it('returns a clear missing state when no transcript pointer is recorded', async () => { diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index 301e4dd16..9ca7ae2ab 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -23,23 +23,23 @@ import { } from './projection-identity.js'; import type { Message } from './providers/types.js'; import { extractLastAssistantContent } from 
'./providers/types.js'; -import { normalizeResultRow } from './result-row-schema.js'; import { AGENTV_RESULTS_ARTIFACTS_REF, CANONICAL_TRACE_ARTIFACT_PATH, CANONICAL_TRANSCRIPT_ARTIFACT_PATH, - TRACE_JSON_MEDIA_TYPE, - TRANSCRIPT_JSONL_MEDIA_TYPE, - TRANSCRIPT_SCHEMA_VERSION, type ResultArtifactFamily, type ResultArtifactPointerWire, type ResultArtifactPointersWire, + TRACE_JSON_MEDIA_TYPE, + TRANSCRIPT_JSONL_MEDIA_TYPE, + TRANSCRIPT_SCHEMA_VERSION, type TranscriptArtifactPointerWire, toResultArtifactPointerWire, } from './result-artifact-contract.js'; +import { normalizeResultRow } from './result-row-schema.js'; import { - type TraceEnvelope, EXECUTION_TRACE_SCHEMA_VERSION, + type TraceEnvelope, buildTraceEnvelopeFromEvaluationResult, toTraceEnvelopeWire, traceEnvelopeToTranscriptMessages, @@ -817,10 +817,7 @@ async function writeTraceEnvelopeSidecar( return envelope; } -function buildSidecarArtifactKey( - family: ResultArtifactFamily, - runRelativePath: string, -): string { +function buildSidecarArtifactKey(family: ResultArtifactFamily, runRelativePath: string): string { return path.posix.join(family, runRelativePath); } @@ -938,7 +935,6 @@ export function buildIndexArtifactEntry( transcript_path: options.transcriptPath ? toRelativeArtifactPath(options.outputDir, options.transcriptPath) : undefined, - artifact_pointers: options.artifactPointers, raw_provider_log_path: options.rawProviderLogPath ? 
toRelativeArtifactPath(options.outputDir, options.rawProviderLogPath) : undefined, From b25b047568225231e9bea42e9938035dd31f1087 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 07:36:59 +0200 Subject: [PATCH 17/21] fix(results): tolerate grader scores without assertions --- apps/cli/test/commands/results/export.test.ts | 33 +++++++++++++++++++ packages/core/src/evaluation/run-artifacts.ts | 2 +- .../core/src/evaluation/trace-envelope.ts | 2 +- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index e14fe7aae..434c45136 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -294,6 +294,39 @@ describe('results export', () => { expect(serialized).not.toContain('SECRET_SCORE_EVIDENCE'); }); + it('builds projection bundles when grader scores omit assertion arrays', () => { + const sourceFile = path.join(tempDir, 'runs', 'legacy-grader-run', 'index.jsonl'); + const [result] = parseJsonlResults( + toJsonl({ + ...RESULT_FULL, + scores: [ + { + name: 'legacy_grader', + type: 'llm-grader', + score: 1, + }, + ], + }), + ); + + const bundle = buildProjectionBundle([result], { + sourceFile, + runId: 'legacy-grader-run', + cwd: tempDir, + }); + + expect(bundle.entries[0].feedback.scores?.[0]).toMatchObject({ + name: 'legacy_grader', + type: 'llm-grader', + score: 1, + }); + expect(bundle.entries[0].trace_envelope.scores?.[0]).toMatchObject({ + name: 'legacy_grader', + type: 'llm-grader', + score: 1, + }); + }); + it('includes raw prompt, output, tool payloads, and score evidence only with opt-in', () => { const sourceFile = path.join(tempDir, 'runs', 'privacy-run', 'index.jsonl'); const [result] = parseJsonlResults(toJsonl(RESULT_WITH_RAW_PAYLOADS)); diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index 9ca7ae2ab..af93cc692 100644 --- 
a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -373,7 +373,7 @@ function toIndexScore(score: GraderResult): Record { score: score.score, weight: score.weight, verdict: score.verdict, - assertions: score.assertions.map(toIndexAssertion), + assertions: (score.assertions ?? []).map(toIndexAssertion), raw_request: score.rawRequest, input: score.input, target: score.target, diff --git a/packages/core/src/evaluation/trace-envelope.ts b/packages/core/src/evaluation/trace-envelope.ts index ba3244139..c1fbb961e 100644 --- a/packages/core/src/evaluation/trace-envelope.ts +++ b/packages/core/src/evaluation/trace-envelope.ts @@ -775,7 +775,7 @@ function scoresFromResult( targetSpanId, evidence: dropUndefined({ span_ids: [targetSpanId], - assertions: score.assertions.map((assertion) => + assertions: (score.assertions ?? []).map((assertion) => dropUndefined({ text: assertion.text, passed: assertion.passed, From e331bdacbe39f36e10c4ed7f10e6c2c6974a2038 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 07:38:20 +0200 Subject: [PATCH 18/21] docs(dogfood): record reviewed stack results --- ...ogfood-integration-av-vwa-16-10-dogfood.md | 175 ++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 docs/dogfood-reports/2026-06-21-dogfood-integration-av-vwa-16-10-dogfood.md diff --git a/docs/dogfood-reports/2026-06-21-dogfood-integration-av-vwa-16-10-dogfood.md b/docs/dogfood-reports/2026-06-21-dogfood-integration-av-vwa-16-10-dogfood.md new file mode 100644 index 000000000..04b66ff1a --- /dev/null +++ b/docs/dogfood-reports/2026-06-21-dogfood-integration-av-vwa-16-10-dogfood.md @@ -0,0 +1,175 @@ +# Dogfood Report - dogfood-integration-av-vwa-16-10 + +> Diff-scoped CLI and Dashboard QA of `dogfood-integration-av-vwa-16-10` vs `origin/main`. Generated by `/ce-dogfood-beta` on 2026-06-21. 
+ +## Diff Summary + +- Adds canonical AgentV artifact refs and pointer shapes for result rows, trace sidecars, transcript projections, raw provider logs, and projection bundles. +- Writes provider-neutral transcript artifacts at `outputs/transcript.jsonl` while keeping raw provider logs separate at `outputs/raw/provider.log`. +- Adds results combine/export/projection behavior that preserves or rewrites artifact pointers and keeps default exports metadata-oriented. +- Adds oplog-shaped run tag state, tag clear tombstones, and Dashboard/API fields for final run state and watermarks. +- Updates Dashboard API and UI so run lists/details stay metadata-oriented while the Transcript tab lazily loads canonical transcript content and handles missing/dangling/unsupported states. +- Adds a Dashboard trace read model that preserves problematic span graphs with diagnostics and sanitizes external trace or credential-like attributes. + +## Personas + +Source: `STRATEGY.md` "Who it's for". + +- **AI platform engineers and agent builders** - evaluate real agent workflows, compare targets, gate changes, and inspect portable run artifacts from the same workspace their teams already use. 
+ +## Flows Tested + +### Flow A - Canonical Result Artifact Emission + +```mermaid +flowchart TD + A[Eval result is written] --> B[Per-test artifact directory is created] + B --> C[Trace envelope written to outputs/trace.json] + C --> D{Trace has transcript rows?} + D -->|Yes| E[Canonical transcript JSONL written to outputs/transcript.jsonl] + D -->|No| F[Transcript path and pointer are omitted] + E --> G{Raw provider log present?} + F --> G + G -->|Yes| H[Raw log copied to outputs/raw/provider.log] + G -->|No| I[Index row excludes raw_provider_log_path] + H --> J[Index row records raw_provider_log_path separately] + I --> K[Index row emits artifact_pointers with agentv/artifacts/v1] + J --> K + K --> L[Consumers parse snake_case row and reject new camelCase artifactPointers] +``` + +### Flow B - Dashboard Metadata and Lazy Transcript Loading + +```mermaid +flowchart TD + A[User opens Dashboard run list] --> B[API loads lightweight run metadata] + B --> C[User opens a run detail] + C --> D[API hydrates detail without transcript bodies] + D --> E[User selects an eval] + E --> F[Checks tab shows metadata and grader state] + F --> G[User opens Transcript tab] + G --> H{Canonical transcript pointer resolves?} + H -->|Yes| I[Transcript endpoint reads outputs/transcript.jsonl lazily] + H -->|Missing| J[No structured transcript state] + H -->|Dangling| K[Unavailable artifact state with path] + H -->|Unsupported| L[Unsupported pointer state with pointer details] + I --> M[Timeline renders transcript and raw/download links] +``` + +### Flow C - Combine Run Artifact Pointer Rewriting + +```mermaid +flowchart TD + A[User selects two or more run workspaces] --> B[Combine reads each index.jsonl] + B --> C{Duplicate test_id and target rows?} + C -->|Error policy| D[Conflict is reported] + C -->|Latest or explicit choice| E[Selected rows are kept] + C -->|No duplicates| E + E --> F[Referenced artifacts are copied under sources/source-N] + F --> G[Trace artifact pointers are 
rewritten] + G --> H[Transcript artifact pointers are rewritten] + H --> I[Combined index.jsonl points only at copied files] +``` + +### Flow D - Tags and Oplog Watermarks + +```mermaid +flowchart TD + A[Run metadata is read] --> B[Tag sidecar or remote state is materialized] + B --> C[User sets tags] + C --> D[run.tags.set operation watermark is written] + D --> E[Run list/detail exposes final_state and oplog_watermark] + E --> F[User clears tags] + F --> G[Empty tag tombstone is written] + G --> H[final_state.tags is empty and clear watermark is preserved] +``` + +### Flow E - Trace Read Model Hardening + +```mermaid +flowchart TD + A[Dashboard reads trace envelope] --> B[Project spans, events, scores, and external trace metadata] + B --> C[Credential-like and unsafe external attributes are removed] + C --> D{Span graph shape} + D -->|Duplicate span ids| E[Preserve nodes with collision-free ids and diagnostics] + D -->|Missing parents| F[Promote span to diagnostic root] + D -->|Self-parent or cycle| G[Promote cyclic spans to diagnostic roots] + E --> H[Stable tree is rendered] + F --> H + G --> H +``` + +### Flow F - Projection Bundle Export + +```mermaid +flowchart TD + A[User requests projection bundle or dry run] --> B[Completed run manifest is read] + B --> C{Raw content opted in?} + C -->|No default| D[Bundle records metadata-only capture policy] + D --> E[artifact_refs are planned_export and omit raw-bearing paths] + E --> F[Trace envelopes omit raw evidence and transcript metadata payloads] + C -->|Yes| G[Bundle includes raw content and raw-bearing artifact refs] + G --> H[Adapters receive explicit full-content payload] +``` + +## Test Matrix & Results + +| # | Flow | Journey / Scenario | Status | Issue | Fix | Commit | +|---|------|--------------------|--------|-------|-----|--------| +| 1 | A | Artifact writer emits `outputs/transcript.jsonl`, canonical `artifact_pointers.transcript.ref=agentv/artifacts/v1`, and canonical trace pointer refs.
| Pass | Verified by artifact-writer regression tests. | - | - | +| 2 | A | Raw provider log is copied to `outputs/raw/provider.log`, remains separate from canonical transcript rows, and parsed result rows do not treat it as a fresh source log. | Pass | Verified by artifact-writer and orchestrator tests. | - | - | +| 3 | A | New invalid camelCase `artifactPointers` rows are rejected while historical result-row aliases still normalize at the boundary. | Pass | Verified by parser/shared results tests. | - | - | +| 4 | C | Combining runs copies pointed trace/transcript files and rewrites pointer paths/keys to `sources/source-N/...`. | Pass | Verified by combine tests. | - | - | +| 5 | D | Local tag set and tag clear/tombstone operations preserve `final_state` and a fresh `oplog_watermark`. | Pass | Verified by tests and live API set/clear/readback against the fixture server. | - | - | +| 6 | B | Run list, run detail, compare, and index API routes stay metadata-oriented and do not read transcript bodies. | Pass | Verified by serve tests and live API detail payload without transcript body content. | - | - | +| 7 | B | Transcript endpoint returns lazy `ok`, `missing`, `dangling`, and pointer-shaped transcript states from canonical transcript pointers. | Pass | Verified by serve tests, live API calls, and browser Transcript tab states. | - | - | +| 8 | E | Trace read model handles duplicate spans, missing parents, self-parent/cycles, and sanitizes external/credential-like attributes. | Pass | Verified by Dashboard trace read-model tests. | - | - | +| 9 | F | Projection bundle dry-run/default export marks planned refs correctly and excludes raw-bearing payloads by default. | Fixed | Live dry run crashed when a hydrated grader score omitted `assertions`. | Added missing-array fallbacks in result index and trace envelope score serialization, plus regression coverage. 
| b25b0475 | +| 10 | B | Browser UAT: Dashboard run list/detail remains usable, Transcript tab lazy-loads canonical content, and console errors are absent. | Pass | Agent-browser verified run list/detail, canonical/missing/dangling/unsupported Transcript tab states, lazy request logs, and no page errors. | - | - | + +## What Was Fixed + +### Projection bundle dry run crashed on grader scores without assertions - `b25b0475` + +- **Symptom:** `agentv results export --projection-bundle --dry-run` crashed with `undefined is not an object (evaluating 'score.assertions.map')` when a hydrated grading artifact had a grader score without an `assertions` array. +- **Root cause:** `packages/core/src/evaluation/run-artifacts.ts` and `packages/core/src/evaluation/trace-envelope.ts` assumed every `GraderResult` carried `assertions`, but historical or hand-authored grading artifacts can omit that optional array. +- **Fix:** Normalize missing score assertions to an empty array in index-row score serialization and trace-envelope score evidence serialization. +- **Regression test:** `apps/cli/test/commands/results/export.test.ts` now builds a projection bundle from a grader score that omits `assertions`. + +## Console Errors + +None observed through `agent-browser errors` after canonical, missing, dangling, and unsupported Transcript tab checks. `agent-browser console` was also empty on the canonical transcript path. + +Expected test-suite stderr included git fallback warnings for intentionally invalid remote fixtures; the suite passed. + +## Evidence + +- Diff analyzed with `git diff --name-only origin/main...HEAD` and focused code reads across result writing, combine/export, serve, Dashboard detail/API, and trace read model paths. +- Built core with `bun --filter @agentv/core build`. +- Built Dashboard with `bun run build` from `apps/dashboard/`; Vite emitted only the existing large-chunk warning. 
+- Ran focused regression suite after the fix: `333 pass`, `0 fail`, `1372 expect() calls`, across 10 files. +- Live Dashboard/results server started from source against a local fixture project on port 3217. +- Live API checks covered run list/detail, transcript `ok`/`missing`/`dangling`/`unsupported`, tag set/clear/readback, and projection dry run. +- Browser UAT used `agent-browser` with a local fixture project. Screenshots were captured outside the public repo as `transcript-tab.png` and `transcript-unsupported.png`. + +## Human Verifications + +Not applicable. The proof used local fixtures and CLI/Dashboard APIs only; no OAuth, email, payment, SMS, or external provider leg was required. + +## Decisions for a Human + +None. + +## Learnings + +- Projection/export code must tolerate historical or hand-authored grader score records that omit optional arrays. Treat missing optional evidence as empty evidence rather than crashing export. +- The lazy transcript boundary is doing useful work: list/detail payloads remain small and metadata-oriented, while transcript body content is fetched only after the user opens the Transcript tab. +- Raw provider logs stay safe as separate evidence under `outputs/raw/provider.log`; they are not canonical transcripts and should not be reinterpreted as source logs on parsed result rows. + +## Final Status + +Pass after fix. The integrated results/artifacts/transcript stack is ready for review from this dogfood pass. + +Functional failure fixed locally: `b25b0475`. + +Human-decision blockers: none. 
From 979f625a16737c1db2415e8a55bfc67f2d6a1a1a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 07:46:18 +0200 Subject: [PATCH 19/21] fix(results): preserve portable reviewed artifact contracts --- apps/cli/src/commands/eval/run-eval.ts | 18 +++++---- apps/cli/src/commands/results/combine-run.ts | 13 ++++++- apps/cli/src/commands/results/manifest.ts | 1 + .../src/commands/results/remote-metadata.ts | 32 ++++++++++++++- .../commands/eval/artifact-writer.test.ts | 17 ++++++++ .../cli/test/commands/results/combine.test.ts | 22 ++++++++++- .../commands/results/remote-metadata.test.ts | 39 ++++++++++++++++--- 7 files changed, 125 insertions(+), 17 deletions(-) diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 154a0f5cc..fdf4b1869 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -361,6 +361,16 @@ export function trimOutputMessages( return output; } +export function prepareResultForJsonl( + result: EvaluationResult, + options: { readonly outputMessages: number | 'all' }, +): EvaluationResult { + return { + ...result, + output: trimOutputMessages(result.output, options.outputMessages), + }; +} + function normalizeOptions( rawOptions: Record, config?: Awaited>, @@ -1043,13 +1053,7 @@ async function runSingleEvalFile(params: { // Each message is trimmed to { role, content } only (no toolCalls, startTime, etc.). // Full output with tool calls goes to OTel. 
const resultWithMetadata = withSourceMetadata(result, testFilePath, options); - const trimmedOutput = trimOutputMessages(resultWithMetadata.output, options.outputMessages); - const serializableResult = { ...resultWithMetadata }; - serializableResult.rawProviderLogPath = undefined; - const trimmedResult: EvaluationResult = { - ...serializableResult, - output: trimmedOutput, - }; + const trimmedResult = prepareResultForJsonl(resultWithMetadata, options); await outputWriter.append(trimmedResult); // Export to OTel if exporter is configured (skip batch export when streaming is active) diff --git a/apps/cli/src/commands/results/combine-run.ts b/apps/cli/src/commands/results/combine-run.ts index 48ce91ea8..e77af2e5d 100644 --- a/apps/cli/src/commands/results/combine-run.ts +++ b/apps/cli/src/commands/results/combine-run.ts @@ -310,11 +310,14 @@ function toRunId(cwd: string, runDir: string): string { } const MANIFEST_PATH_FIELDS = [ + 'artifact_dir', 'grading_path', 'timing_path', 'input_path', 'output_path', 'response_path', + 'transcript_path', + 'raw_provider_log_path', 'task_dir', 'eval_path', 'targets_path', @@ -433,12 +436,20 @@ function rewriteAndCopyRecord(row: SelectedRow, outputDir: string): ResultManife row.record[field], ); } - rewritten.artifact_pointers = rewriteArtifactPointers( + const artifactPointers = rewriteArtifactPointers( row.record.artifact_pointers, sourceBaseDir, outputDir, row.source.index, ); + rewritten.artifact_pointers = artifactPointers; + if ( + row.record.transcript_path && + rewritten.transcript_path === row.record.transcript_path && + artifactPointers?.transcript?.path + ) { + rewritten.transcript_path = artifactPointers.transcript.path; + } return rewritten as unknown as ResultManifestRecord; } diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 2fbe3fbd5..253e49127 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -44,6 +44,7 
@@ export interface ResultManifestRecord { readonly output_path?: string; readonly answer_path?: string; readonly transcript_path?: string; + readonly raw_provider_log_path?: string; readonly artifact_pointers?: ResultArtifactPointersWire; readonly transcript?: ArtifactPointer; readonly artifacts?: ArtifactPointerMap; diff --git a/apps/cli/src/commands/results/remote-metadata.ts b/apps/cli/src/commands/results/remote-metadata.ts index d9c00a1c3..6c3b1c0b0 100644 --- a/apps/cli/src/commands/results/remote-metadata.ts +++ b/apps/cli/src/commands/results/remote-metadata.ts @@ -134,6 +134,28 @@ function equalTags(a: readonly string[], b: readonly string[]): boolean { return a.every((tag, index) => tag === b[index]); } +function equalWatermarks( + a: RunOplogWatermark | undefined, + b: RunOplogWatermark | undefined, +): boolean { + return ( + a?.ref === b?.ref && + a?.operation_id === b?.operation_id && + a?.updated_at === b?.updated_at + ); +} + +function equalTagFiles(a: TagsFile | undefined, b: TagsFile | undefined): boolean { + if (a === undefined || b === undefined) { + return a === b; + } + return ( + equalTags(a.tags, b.tags) && + a.updatedAt === b.updatedAt && + equalWatermarks(a.oplogWatermark, b.oplogWatermark) + ); +} + function resolveComparisonRef(repoDir: string): string | undefined { const upstream = tryRunGit(repoDir, [ 'rev-parse', @@ -199,7 +221,9 @@ function readRemoteRunTagsContext(repoDir: string, manifestPath: string): Remote function toRemoteRunTagState(context: RemoteRunTagsContext): RemoteRunTagState { const remoteTags = context.baseOverlayTags?.tags ?? context.artifactTags?.tags ?? []; const effectiveTags = context.localOverlayTags?.tags ?? remoteTags; - const dirty = !equalTags(effectiveTags, remoteTags); + const dirty = context.localOverlayTags + ? !equalTagFiles(context.localOverlayTags, context.baseOverlayTags) + : !equalTags(effectiveTags, remoteTags); const watermark = context.localOverlayTags?.oplogWatermark ?? 
context.baseOverlayTags?.oplogWatermark ?? @@ -252,7 +276,11 @@ export function writeRemoteRunTags( const context = readRemoteRunTagsContext(repoDir, manifestPath); const remoteTags = context.baseOverlayTags?.tags ?? context.artifactTags?.tags ?? []; - if (equalTags(cleaned, remoteTags) && context.baseOverlayTags === undefined) { + if ( + cleaned.length > 0 && + equalTags(cleaned, remoteTags) && + context.baseOverlayTags === undefined + ) { rmSync(context.paths.overlayTagsPath, { force: true }); return readRemoteRunTags(repoDir, manifestPath); } diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 17deeba7f..71c1e88e9 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -36,6 +36,8 @@ import { writeArtifacts, writeArtifactsFromResults, } from '../../../src/commands/eval/artifact-writer.js'; +import { prepareResultForJsonl } from '../../../src/commands/eval/run-eval.js'; +import { toSnakeCaseDeep } from '../../../src/utils/case-conversion.js'; function makeResult(overrides: Partial = {}): EvaluationResult { const result = { @@ -637,6 +639,21 @@ describe('parseJsonlResults', () => { expect(results[0].rawProviderLogPath).toBeUndefined(); }); + it('preserves raw provider log pointer metadata at the per-case JSONL boundary', () => { + const rawLogPath = path.join(import.meta.dir, '.test-provider-source.log'); + const result = makeResult({ + testId: 'raw-log-jsonl-case', + rawProviderLogPath: rawLogPath, + }); + + const prepared = prepareResultForJsonl(result, { outputMessages: 1 }); + const wire = toSnakeCaseDeep(prepared) as Record; + + expect(prepared.rawProviderLogPath).toBe(rawLogPath); + expect(wire.raw_provider_log_path).toBe(rawLogPath); + expect(wire).not.toHaveProperty('raw_provider_log'); + }); + it('handles empty content', () => { expect(parseJsonlResults('')).toHaveLength(0); }); diff --git 
a/apps/cli/test/commands/results/combine.test.ts b/apps/cli/test/commands/results/combine.test.ts index b35b15424..cac10c0ae 100644 --- a/apps/cli/test/commands/results/combine.test.ts +++ b/apps/cli/test/commands/results/combine.test.ts @@ -97,6 +97,9 @@ describe('results combine', () => { it('copies and rewrites artifact pointers when combining runs', () => { const first = seedRun('run-a', [ result({ + artifact_dir: 'demo/test-a', + transcript_path: 'demo/test-a/outputs/transcript.jsonl', + raw_provider_log_path: 'demo/test-a/outputs/raw/provider.log', artifact_pointers: { trace: { ref: 'agentv/artifacts/v1', @@ -123,7 +126,7 @@ describe('results combine', () => { }, }), ]); - mkdirSync(path.join(first, 'demo', 'test-a', 'outputs'), { recursive: true }); + mkdirSync(path.join(first, 'demo', 'test-a', 'outputs', 'raw'), { recursive: true }); writeFileSync(path.join(first, 'demo', 'test-a', 'outputs', 'trace.json'), '{"trace":[]}\n'); writeFileSync( path.join(first, 'demo', 'test-a', 'outputs', 'transcript.jsonl'), @@ -137,6 +140,10 @@ describe('results combine', () => { source: { provider: 'mock', session_id: 'session-a' }, })}\n`, ); + writeFileSync( + path.join(first, 'demo', 'test-a', 'outputs', 'raw', 'provider.log'), + '{"event":"provider-native"}\n', + ); const second = seedRun('run-b', [ result({ timestamp: '2026-06-01T11:00:00.000Z', @@ -159,7 +166,13 @@ describe('results combine', () => { }); const [record] = readIndex(combined.manifestPath); - expect(record).not.toHaveProperty('transcript_path'); + expect(record.artifact_dir).toBe('sources/source-1/demo/test-a'); + expect(record.transcript_path).toBe( + 'sources/source-1/demo/test-a/outputs/transcript.jsonl', + ); + expect(record.raw_provider_log_path).toBe( + 'sources/source-1/demo/test-a/outputs/raw/provider.log', + ); expect(record.artifact_pointers).toMatchObject({ trace: { key: 'traces/sources/source-1/demo/test-a/outputs/trace.json', @@ -178,6 +191,11 @@ describe('results combine', () => { 
path.join(combined.runDir, 'sources/source-1/demo/test-a/outputs/transcript.jsonl'), ), ).toBe(true); + expect( + existsSync( + path.join(combined.runDir, 'sources/source-1/demo/test-a/outputs/raw/provider.log'), + ), + ).toBe(true); }); it('errors on duplicate rows unless latest is explicit', () => { diff --git a/apps/cli/test/commands/results/remote-metadata.test.ts b/apps/cli/test/commands/results/remote-metadata.test.ts index 8b47b6f86..44fe86dbc 100644 --- a/apps/cli/test/commands/results/remote-metadata.test.ts +++ b/apps/cli/test/commands/results/remote-metadata.test.ts @@ -34,7 +34,10 @@ function git(cmd: string, cwd: string): string { }).trim(); } -function seedRepo(repoDir: string): string { +function seedRepo( + repoDir: string, + options?: { readonly artifactTags?: readonly string[] }, +): string { git('git init --quiet', repoDir); git('git config user.email "test@example.com"', repoDir); git('git config user.name "Test User"', repoDir); @@ -42,10 +45,17 @@ function seedRepo(repoDir: string): string { const runDir = path.join(repoDir, 'runs', 'default', RUN_TIMESTAMP); mkdirSync(runDir, { recursive: true }); writeFileSync(path.join(runDir, 'index.jsonl'), '{"test_id":"alpha","score":1}\n'); - writeFileSync( - path.join(runDir, 'tags.json'), - `${JSON.stringify({ tags: ['remote-baseline'], updated_at: '2026-06-06T09:00:00.000Z' }, null, 2)}\n`, - ); + const artifactTags = options?.artifactTags ?? 
['remote-baseline']; + if (artifactTags.length > 0) { + writeFileSync( + path.join(runDir, 'tags.json'), + `${JSON.stringify( + { tags: artifactTags, updated_at: '2026-06-06T09:00:00.000Z' }, + null, + 2, + )}\n`, + ); + } git('git add runs', repoDir); git('git commit --quiet -m "seed remote run"', repoDir); return path.join(runDir, 'index.jsonl'); @@ -117,6 +127,25 @@ describe('remote metadata tags', () => { expect(readFileSync(state.metadataPath, 'utf8')).toContain('"tags": []'); }); + it('records an explicit clear watermark when the remote baseline is already empty', () => { + const manifestPath = seedRepo(repoDir, { artifactTags: [] }); + + const state = writeRemoteRunTags(repoDir, manifestPath, []); + const metadata = JSON.parse(readFileSync(state.metadataPath, 'utf8')) as { + tags: string[]; + oplog_watermark: { ref: string; operation_id?: string; updated_at?: string }; + }; + + expect(state.tags).toEqual([]); + expect(state.remoteTags).toEqual([]); + expect(state.pendingTags).toEqual([]); + expect(state.dirty).toBe(true); + expect(state.oplogWatermark.ref).toBe(RUN_OPLOG_REF); + expect(state.oplogWatermark.operation_id).toBeString(); + expect(metadata.tags).toEqual([]); + expect(metadata.oplog_watermark.operation_id).toBe(state.oplogWatermark.operation_id); + }); + it('rejects writes when the configured results path is not a git checkout', () => { const runDir = path.join(repoDir, 'runs', 'default', RUN_TIMESTAMP); mkdirSync(runDir, { recursive: true }); From be40cd9d9b1ccc50ade56c168b8c59e36ec5f2f0 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 07:53:43 +0200 Subject: [PATCH 20/21] fix(results): block symlink artifact escapes Canonicalize dashboard artifact targets before reading so in-run symlinks cannot expose files outside the run workspace. Cover transcript, answer, and raw file artifact reads with focused regressions. 
--- apps/cli/src/commands/results/serve.ts | 82 ++++++++---- apps/cli/test/commands/results/serve.test.ts | 129 ++++++++++++++++++- 2 files changed, 185 insertions(+), 26 deletions(-) diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 970e564ca..b6e8403b1 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -34,7 +34,15 @@ * - createApp(results, cwd) — Hono app factory */ -import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync } from 'node:fs'; +import { + existsSync, + mkdirSync, + readFileSync, + readdirSync, + realpathSync, + statSync, + writeFileSync, +} from 'node:fs'; import { homedir } from 'node:os'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; @@ -381,22 +389,55 @@ function resolveRunArtifactPath( ): { absolutePath?: string; error?: string } { const absolutePath = path.resolve(baseDir, relativePath); const resolvedBase = path.resolve(baseDir); - if (absolutePath !== resolvedBase && !absolutePath.startsWith(`${resolvedBase}${path.sep}`)) { + if (!isPathInsideDirectory(resolvedBase, absolutePath)) { return { error: 'Artifact path is outside the run workspace.' 
}; } return { absolutePath }; } +function isPathInsideDirectory(baseDir: string, candidatePath: string): boolean { + const relative = path.relative(baseDir, candidatePath); + return relative === '' || (!!relative && !relative.startsWith('..') && !path.isAbsolute(relative)); +} + +function resolveReadableRunArtifactFile( + baseDir: string, + relativePath: string, +): { absolutePath?: string; error?: string } { + const resolved = resolveRunArtifactPath(baseDir, relativePath); + if (!resolved.absolutePath) return { error: resolved.error }; + + let realBase: string; + let realArtifact: string; + try { + realBase = realpathSync(baseDir); + realArtifact = realpathSync(resolved.absolutePath); + } catch { + return {}; + } + + if (!isPathInsideDirectory(realBase, realArtifact)) { + return { error: 'Artifact path is outside the run workspace.' }; + } + + try { + if (!statSync(realArtifact).isFile()) { + return {}; + } + } catch { + return {}; + } + + return { absolutePath: realArtifact }; +} + function readOptionalRunArtifactText( baseDir: string, artifact: ResolvedArtifactPointer, ): string | undefined { if (!artifact.path) return undefined; - const resolved = resolveRunArtifactPath(baseDir, artifact.path); + const resolved = resolveReadableRunArtifactFile(baseDir, artifact.path); if (!resolved.absolutePath) return undefined; - if (!existsSync(resolved.absolutePath) || !statSync(resolved.absolutePath).isFile()) { - return undefined; - } return readFileSync(resolved.absolutePath, 'utf8'); } @@ -1037,33 +1078,27 @@ async function handleEvalFileContent(c: C, { searchDir, projectId }: DataContext await ensureRunReadable(searchDir, meta, projectId); const baseDir = path.dirname(meta.path); - const absolutePath = path.resolve(baseDir, filePath); - - // Security: prevent path traversal — resolved path must be inside baseDir - if ( - !absolutePath.startsWith(path.resolve(baseDir) + path.sep) && - absolutePath !== path.resolve(baseDir) - ) { + const resolvedFile = 
resolveReadableRunArtifactFile(baseDir, filePath); + if (resolvedFile.error) { return c.json({ error: 'Path traversal not allowed' }, 403); } - - if (!existsSync(absolutePath) || !statSync(absolutePath).isFile()) { + if (!resolvedFile.absolutePath) { return c.json({ error: 'File not found' }, 404); } try { - const fileContent = readFileSync(absolutePath, 'utf8'); + const fileContent = readFileSync(resolvedFile.absolutePath, 'utf8'); if (c.req.query('raw') === '1' || c.req.query('download') === '1') { - c.header('Content-Type', inferRawContentType(absolutePath)); + c.header('Content-Type', inferRawContentType(filePath)); if (c.req.query('download') === '1') { c.header( 'Content-Disposition', - `attachment; filename="${contentDispositionFilename(absolutePath)}"`, + `attachment; filename="${contentDispositionFilename(filePath)}"`, ); } return c.body(fileContent); } - const language = inferLanguage(absolutePath); + const language = inferLanguage(filePath); return c.json({ content: fileContent, language }); } catch { return c.json({ error: 'Failed to read file' }, 500); @@ -1093,8 +1128,8 @@ async function handleEvalTranscript(c: C, { searchDir, projectId }: DataContext) }); } - const resolvedTranscript = resolveRunArtifactPath(baseDir, transcript.path); - if (!resolvedTranscript.absolutePath) { + const resolvedTranscript = resolveReadableRunArtifactFile(baseDir, transcript.path); + if (resolvedTranscript.error) { return c.json({ status: 'dangling', transcript_path: transcript.path, @@ -1103,10 +1138,7 @@ async function handleEvalTranscript(c: C, { searchDir, projectId }: DataContext) }); } - if ( - !existsSync(resolvedTranscript.absolutePath) || - !statSync(resolvedTranscript.absolutePath).isFile() - ) { + if (!resolvedTranscript.absolutePath) { const refMessage = transcript.ref ? 
` on ${transcript.ref}` : ''; return c.json({ status: 'dangling', diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 7e6a357a3..389089eaf 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -1,6 +1,14 @@ import { afterEach, beforeEach, describe, expect, it, spyOn } from 'bun:test'; import { execFileSync, execSync } from 'node:child_process'; -import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { + existsSync, + mkdirSync, + mkdtempSync, + readFileSync, + rmSync, + symlinkSync, + writeFileSync, +} from 'node:fs'; import os from 'node:os'; import { tmpdir } from 'node:os'; import path from 'node:path'; @@ -2849,6 +2857,94 @@ describe('serve app', () => { expect(data.message).toContain('not available'); }); + it('treats symlinked transcript artifacts outside the run workspace as dangling', async () => { + const secret = 'outside transcript secret'; + const outsidePath = path.join(tempDir, 'outside-transcript.jsonl'); + writeFileSync(outsidePath, secret); + + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'escaped-transcript'); + const runId = 'escaped-transcript::2026-03-25T13-30-00-000Z'; + const timestampDir = path.join(runsDir, '2026-03-25T13-30-00-000Z'); + const artifactPath = 'demo/test-greeting/outputs/transcript.jsonl'; + const symlinkPath = path.join(timestampDir, artifactPath); + + mkdirSync(path.dirname(symlinkPath), { recursive: true }); + symlinkSync(outsidePath, symlinkPath); + writeFileSync( + path.join(timestampDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + experiment: 'escaped-transcript', + transcript_path: artifactPath, + }), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request( + `/api/runs/${encodeURIComponent(runId)}/evals/test-greeting/transcript`, + ); + + expect(res.status).toBe(200); + const 
text = await res.text(); + expect(text).not.toContain(secret); + const data = JSON.parse(text) as { status: string; transcript_path: string }; + expect(data.status).toBe('dangling'); + expect(data.transcript_path).toBe(artifactPath); + }); + + it('omits symlinked answer artifacts outside the run workspace from transcript responses', async () => { + const secret = 'outside answer secret'; + const outsidePath = path.join(tempDir, 'outside-answer.md'); + writeFileSync(outsidePath, secret); + + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'escaped-answer'); + const runId = 'escaped-answer::2026-03-25T13-45-00-000Z'; + const timestampDir = path.join(runsDir, '2026-03-25T13-45-00-000Z'); + const transcriptArtifactPath = 'demo/test-greeting/outputs/transcript.jsonl'; + const answerArtifactPath = 'demo/test-greeting/outputs/answer.md'; + const transcriptPath = path.join(timestampDir, transcriptArtifactPath); + const answerPath = path.join(timestampDir, answerArtifactPath); + const transcriptJsonl = `${JSON.stringify({ + test_id: 'test-greeting', + target: 'gpt-4o', + message_index: 0, + role: 'user', + content: 'Hello', + })}\n`; + + mkdirSync(path.dirname(transcriptPath), { recursive: true }); + writeFileSync(transcriptPath, transcriptJsonl); + symlinkSync(outsidePath, answerPath); + writeFileSync( + path.join(timestampDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + experiment: 'escaped-answer', + transcript_path: transcriptArtifactPath, + answer_path: answerArtifactPath, + }), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request( + `/api/runs/${encodeURIComponent(runId)}/evals/test-greeting/transcript`, + ); + + expect(res.status).toBe(200); + const text = await res.text(); + expect(text).not.toContain(secret); + const data = JSON.parse(text) as { + status: string; + content: string; + answer_path: string; + answer_content?: string; + }; + expect(data.status).toBe('ok'); + 
expect(data.content).toBe(transcriptJsonl); + expect(data.answer_path).toBe(answerArtifactPath); + expect(data.answer_content).toBeUndefined(); + }); + it('does not read transcript bodies for list, detail, or aggregate routes', async () => { const timestamp = '2026-03-25T14-00-00-000Z'; const transcriptArtifactPath = 'demo/test-greeting/outputs/transcript.jsonl'; @@ -2974,6 +3070,37 @@ describe('serve app', () => { ); expect(await downloadRes.text()).toBe(transcriptJsonl); }); + + it('rejects symlinked artifact file reads outside the run workspace', async () => { + const secret = 'outside raw artifact secret'; + const outsidePath = path.join(tempDir, 'outside-response.md'); + writeFileSync(outsidePath, secret); + + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'escaped-file'); + const runId = 'escaped-file::2026-03-25T10-30-00-000Z'; + const timestampDir = path.join(runsDir, '2026-03-25T10-30-00-000Z'); + const artifactPath = 'demo/test-greeting/outputs/response.md'; + const symlinkPath = path.join(timestampDir, artifactPath); + + mkdirSync(path.dirname(symlinkPath), { recursive: true }); + symlinkSync(outsidePath, symlinkPath); + writeFileSync( + path.join(timestampDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + experiment: 'escaped-file', + output_path: artifactPath, + }), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request( + `/api/runs/${encodeURIComponent(runId)}/evals/test-greeting/files/${artifactPath}?raw=1`, + ); + + expect(res.status).toBe(403); + expect(await res.text()).not.toContain(secret); + }); }); // ── GET /api/compare (tag filter) ─────────────────────────────────── From 22269131b5ff4fc9df6c41ee2df9f11012be92ae Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 21 Jun 2026 09:23:15 +0200 Subject: [PATCH 21/21] style: format reviewed results stack --- apps/cli/src/commands/results/remote-metadata.ts | 4 +--- apps/cli/src/commands/results/serve.ts | 4 +++- 
apps/cli/test/commands/results/combine.test.ts | 4 +--- packages/core/src/evaluation/results-repo.ts | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/apps/cli/src/commands/results/remote-metadata.ts b/apps/cli/src/commands/results/remote-metadata.ts index 6c3b1c0b0..21513efae 100644 --- a/apps/cli/src/commands/results/remote-metadata.ts +++ b/apps/cli/src/commands/results/remote-metadata.ts @@ -139,9 +139,7 @@ function equalWatermarks( b: RunOplogWatermark | undefined, ): boolean { return ( - a?.ref === b?.ref && - a?.operation_id === b?.operation_id && - a?.updated_at === b?.updated_at + a?.ref === b?.ref && a?.operation_id === b?.operation_id && a?.updated_at === b?.updated_at ); } diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index b6e8403b1..a846f0beb 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -397,7 +397,9 @@ function resolveRunArtifactPath( function isPathInsideDirectory(baseDir: string, candidatePath: string): boolean { const relative = path.relative(baseDir, candidatePath); - return relative === '' || (!!relative && !relative.startsWith('..') && !path.isAbsolute(relative)); + return ( + relative === '' || (!!relative && !relative.startsWith('..') && !path.isAbsolute(relative)) + ); } function resolveReadableRunArtifactFile( diff --git a/apps/cli/test/commands/results/combine.test.ts b/apps/cli/test/commands/results/combine.test.ts index cac10c0ae..889ef84d0 100644 --- a/apps/cli/test/commands/results/combine.test.ts +++ b/apps/cli/test/commands/results/combine.test.ts @@ -167,9 +167,7 @@ describe('results combine', () => { const [record] = readIndex(combined.manifestPath); expect(record.artifact_dir).toBe('sources/source-1/demo/test-a'); - expect(record.transcript_path).toBe( - 'sources/source-1/demo/test-a/outputs/transcript.jsonl', - ); + expect(record.transcript_path).toBe('sources/source-1/demo/test-a/outputs/transcript.jsonl'); 
expect(record.raw_provider_log_path).toBe( 'sources/source-1/demo/test-a/outputs/raw/provider.log', ); diff --git a/packages/core/src/evaluation/results-repo.ts b/packages/core/src/evaluation/results-repo.ts index b08bff3af..11f7f5bd2 100644 --- a/packages/core/src/evaluation/results-repo.ts +++ b/packages/core/src/evaluation/results-repo.ts @@ -14,8 +14,8 @@ import path from 'node:path'; import { promisify } from 'node:util'; import { getAgentvDataDir } from '../paths.js'; -import { AGENTV_RESULTS_PRIMARY_REF } from './result-artifact-contract.js'; import type { ResultsConfig } from './loaders/config-loader.js'; +import { AGENTV_RESULTS_PRIMARY_REF } from './result-artifact-contract.js'; const execFileAsync = promisify(execFile); // Local working-tree run workspace inside the eval repo. Local commands