diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 859f1ff4b..7798fc554 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -94,6 +94,7 @@ export function buildIndexArtifactEntry( outputPath?: string; answerPath?: string; transcriptPath?: string; + rawProviderLogPath?: string; inputPath?: string; responsePath?: string; taskBundle?: MaterializedTaskBundlePaths; diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 43637c598..fdf4b1869 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -361,6 +361,16 @@ export function trimOutputMessages( return output; } +export function prepareResultForJsonl( + result: EvaluationResult, + options: { readonly outputMessages: number | 'all' }, +): EvaluationResult { + return { + ...result, + output: trimOutputMessages(result.output, options.outputMessages), + }; +} + function normalizeOptions( rawOptions: Record, config?: Awaited>, @@ -1043,11 +1053,7 @@ async function runSingleEvalFile(params: { // Each message is trimmed to { role, content } only (no toolCalls, startTime, etc.). // Full output with tool calls goes to OTel. 
const resultWithMetadata = withSourceMetadata(result, testFilePath, options); - const trimmedOutput = trimOutputMessages(resultWithMetadata.output, options.outputMessages); - const trimmedResult: EvaluationResult = { - ...resultWithMetadata, - output: trimmedOutput, - }; + const trimmedResult = prepareResultForJsonl(resultWithMetadata, options); await outputWriter.append(trimmedResult); // Export to OTel if exporter is configured (skip batch export when streaming is active) diff --git a/apps/cli/src/commands/inspect/utils.ts b/apps/cli/src/commands/inspect/utils.ts index 6ea549678..574e9de81 100644 --- a/apps/cli/src/commands/inspect/utils.ts +++ b/apps/cli/src/commands/inspect/utils.ts @@ -8,7 +8,7 @@ import { resolveExistingRunPrimaryPath, resolveWorkspaceOrFilePath, } from '../eval/result-layout.js'; -import { loadManifestResults } from '../results/manifest.js'; +import { loadLightweightResults, loadManifestResults } from '../results/manifest.js'; import { ResultRowSchemaError, normalizeResultRow } from '../results/result-row-schema.js'; // ANSI color codes (no dependency needed) @@ -636,7 +636,7 @@ export function listResultFilesFromRunsDir(runsDir: string, limit?: number): Res for (const { filePath, displayName, runId } of limited) { try { const fileStat = statSync(filePath); - const results = loadResultFile(filePath); + const results = loadLightweightResults(filePath); const testCount = results.length; const passCount = results.filter((r) => r.score >= DEFAULT_THRESHOLD).length; diff --git a/apps/cli/src/commands/results/combine-run.ts b/apps/cli/src/commands/results/combine-run.ts index 2e97ff711..e77af2e5d 100644 --- a/apps/cli/src/commands/results/combine-run.ts +++ b/apps/cli/src/commands/results/combine-run.ts @@ -20,7 +20,12 @@ import { } from 'node:fs'; import path from 'node:path'; -import type { EvaluationResult } from '@agentv/core'; +import type { + EvaluationResult, + ResultArtifactPointerWire, + ResultArtifactPointersWire, + 
TranscriptArtifactPointerWire, +} from '@agentv/core'; import { type BenchmarkArtifact, @@ -305,11 +310,14 @@ function toRunId(cwd: string, runDir: string): string { } const MANIFEST_PATH_FIELDS = [ + 'artifact_dir', 'grading_path', 'timing_path', 'input_path', 'output_path', 'response_path', + 'transcript_path', + 'raw_provider_log_path', 'task_dir', 'eval_path', 'targets_path', @@ -317,6 +325,15 @@ const MANIFEST_PATH_FIELDS = [ 'graders_path', ] as const; +const POINTER_FAMILIES = { + trace: 'traces', + transcript: 'transcripts', +} as const; + +function isSafeRelativeArtifactPath(relativePath: string): boolean { + return !path.isAbsolute(relativePath) && !relativePath.split(/[\\/]+/).includes('..'); +} + function copyReferencedArtifact( sourceBaseDir: string, outputDir: string, @@ -324,7 +341,7 @@ function copyReferencedArtifact( relativePath: string | undefined, ): string | undefined { if (!relativePath) return undefined; - if (path.isAbsolute(relativePath) || relativePath.split(/[\\/]+/).includes('..')) { + if (!isSafeRelativeArtifactPath(relativePath)) { throw new Error(`Unsafe artifact path in source manifest: ${relativePath}`); } const sourcePath = path.join(sourceBaseDir, relativePath); @@ -343,6 +360,71 @@ function copyReferencedArtifact( return rewritten; } +function rewriteArtifactPointer( + pointerName: keyof typeof POINTER_FAMILIES, + pointer: ResultArtifactPointerWire | undefined, + sourceBaseDir: string, + outputDir: string, + sourceIndex: number, +): ResultArtifactPointerWire | undefined { + if (!pointer) { + return undefined; + } + + if (!isSafeRelativeArtifactPath(pointer.path)) { + throw new Error(`Unsafe artifact path in source manifest: ${pointer.path}`); + } + const sourcePath = path.join(sourceBaseDir, pointer.path); + if (!existsSync(sourcePath)) { + return { ...pointer }; + } + + const rewrittenPath = copyReferencedArtifact(sourceBaseDir, outputDir, sourceIndex, pointer.path); + if (!rewrittenPath) { + return { ...pointer }; + } + + const 
family = pointer.family ?? POINTER_FAMILIES[pointerName]; + return { + ...pointer, + path: rewrittenPath, + key: path.posix.join(family, rewrittenPath), + }; +} + +function rewriteTranscriptArtifactPointer( + pointer: TranscriptArtifactPointerWire | undefined, + sourceBaseDir: string, + outputDir: string, + sourceIndex: number, +): TranscriptArtifactPointerWire | undefined { + return rewriteArtifactPointer('transcript', pointer, sourceBaseDir, outputDir, sourceIndex) as + | TranscriptArtifactPointerWire + | undefined; +} + +function rewriteArtifactPointers( + pointers: ResultArtifactPointersWire | undefined, + sourceBaseDir: string, + outputDir: string, + sourceIndex: number, +): ResultArtifactPointersWire | undefined { + if (!pointers) { + return undefined; + } + + return { + ...pointers, + trace: rewriteArtifactPointer('trace', pointers.trace, sourceBaseDir, outputDir, sourceIndex), + transcript: rewriteTranscriptArtifactPointer( + pointers.transcript, + sourceBaseDir, + outputDir, + sourceIndex, + ), + }; +} + function rewriteAndCopyRecord(row: SelectedRow, outputDir: string): ResultManifestRecord { const sourceBaseDir = path.dirname(row.source.manifestPath); const rewritten: Record = { ...row.record }; @@ -354,6 +436,20 @@ function rewriteAndCopyRecord(row: SelectedRow, outputDir: string): ResultManife row.record[field], ); } + const artifactPointers = rewriteArtifactPointers( + row.record.artifact_pointers, + sourceBaseDir, + outputDir, + row.source.index, + ); + rewritten.artifact_pointers = artifactPointers; + if ( + row.record.transcript_path && + rewritten.transcript_path === row.record.transcript_path && + artifactPointers?.transcript?.path + ) { + rewritten.transcript_path = artifactPointers.transcript.path; + } return rewritten as unknown as ResultManifestRecord; } diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts index 76139169e..69e3f22d7 100644 --- a/apps/cli/src/commands/results/export.ts +++ 
b/apps/cli/src/commands/results/export.ts @@ -21,14 +21,22 @@ * - To add new per-test workspace files, add them under each test directory. */ +import { readFileSync } from 'node:fs'; import path from 'node:path'; -import { command, oneOf, option, optional, positional, string } from 'cmd-ts'; +import { command, flag, oneOf, option, optional, positional, string } from 'cmd-ts'; -import type { EvaluationResult, ExportDuplicatePolicy } from '@agentv/core'; +import type { EvaluationResult, ExportDuplicatePolicy, IndexArtifactEntry } from '@agentv/core'; import { parseJsonlResults, writeArtifactsFromResults } from '../eval/artifact-writer.js'; import { RESULT_INDEX_FILENAME } from '../eval/result-layout.js'; +import { loadManifestResults } from './manifest.js'; +import { + type ProjectionBundle, + buildProjectionBundle, + serializeProjectionBundle, + writeProjectionBundle, +} from './projection-bundle.js'; import { loadResults as loadSharedResults, resolveSourceFile } from './shared.js'; // ── Export logic ───────────────────────────────────────────────────────── @@ -92,6 +100,36 @@ export async function loadExportSource( return { sourceFile, results }; } +function readIndexArtifactEntries(indexPath: string): IndexArtifactEntry[] { + return readFileSync(indexPath, 'utf8') + .split(/\r?\n/) + .map((line) => line.trim()) + .filter(Boolean) + .map((line) => JSON.parse(line) as IndexArtifactEntry); +} + +export function buildProjectionBundleFromExportedIndex(options: { + readonly sourceFile: string; + readonly outputDir: string; + readonly cwd?: string; + readonly includeRawContent?: boolean; + readonly duplicatePolicy?: ExportDuplicatePolicy; +}): ProjectionBundle { + const indexPath = path.join(options.outputDir, RESULT_INDEX_FILENAME); + const indexRecords = readIndexArtifactEntries(indexPath); + const emittedResults = loadManifestResults(indexPath); + + return buildProjectionBundle(emittedResults, { + sourceFile: options.sourceFile, + runId: 
deriveExportRunId(options.sourceFile), + cwd: options.cwd, + duplicatePolicy: options.duplicatePolicy, + includeRawContent: options.includeRawContent, + artifactRefStatus: 'emitted', + indexRecords, + }); +} + // ── CLI command ────────────────────────────────────────────────────────── export const resultsExportCommand = command({ @@ -122,10 +160,34 @@ export const resultsExportCommand = command({ description: 'How to handle duplicate projection identities in the output: update (default), skip, or error', }), + projectionBundle: flag({ + long: 'projection-bundle', + description: 'Write a vendor-neutral projection_bundle.json alongside exported artifacts', + }), + dryRun: flag({ + long: 'dry-run', + description: 'Print deterministic projection bundle JSON without writing export artifacts', + }), + includeRawContent: flag({ + long: 'include-raw-content', + description: + 'Include raw prompt, output, and tool payload content in the projection bundle (off by default)', + }), }, - handler: async ({ source, out, dir, duplicatePolicy }) => { + handler: async ({ + source, + out, + dir, + duplicatePolicy, + projectionBundle, + dryRun, + includeRawContent, + }) => { const cwd = dir ?? process.cwd(); const policy = (duplicatePolicy ?? 
'update') as ExportDuplicatePolicy; + const shouldWriteProjectionBundle = projectionBundle; + const shouldDryRun = dryRun; + const shouldIncludeRawContent = includeRawContent; try { const { sourceFile, results } = await loadExportSource(source, cwd); @@ -136,14 +198,44 @@ export const resultsExportCommand = command({ : path.resolve(cwd, out) : deriveOutputDir(cwd, sourceFile); + const buildBundle = () => + buildProjectionBundle(results, { + sourceFile, + runId: deriveExportRunId(sourceFile), + cwd, + duplicatePolicy: policy, + includeRawContent: shouldIncludeRawContent, + }); + + if (shouldDryRun) { + process.stdout.write(serializeProjectionBundle(buildBundle())); + return; + } + await writeArtifactsFromResults(results, outputDir, { evalFile: sourceFile, runId: deriveExportRunId(sourceFile), duplicatePolicy: policy, }); + const bundlePath = shouldWriteProjectionBundle + ? await writeProjectionBundle( + buildProjectionBundleFromExportedIndex({ + sourceFile, + outputDir, + cwd, + duplicatePolicy: policy, + includeRawContent: shouldIncludeRawContent, + }), + outputDir, + ) + : undefined; + // Report exported test IDs console.log(`Exported ${results.length} test(s) to ${outputDir}`); + if (bundlePath) { + console.log(`Projection bundle written to ${bundlePath}`); + } for (const result of results) { console.log(` ${result.testId ?? 
'unknown'}`); } diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 65044552e..253e49127 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -3,6 +3,7 @@ import path from 'node:path'; import { type EvaluationResult, + type ResultArtifactPointersWire, type TraceSummary, type TranscriptJsonLine, buildTraceFromMessages, @@ -43,6 +44,10 @@ export interface ResultManifestRecord { readonly output_path?: string; readonly answer_path?: string; readonly transcript_path?: string; + readonly raw_provider_log_path?: string; + readonly artifact_pointers?: ResultArtifactPointersWire; + readonly transcript?: ArtifactPointer; + readonly artifacts?: ArtifactPointerMap; readonly response_path?: string; readonly artifact_dir?: string; readonly task_dir?: string; @@ -53,6 +58,36 @@ export interface ResultManifestRecord { readonly metadata?: Record; } +export type ArtifactPointer = + | string + | { + readonly path?: unknown; + readonly artifact_path?: unknown; + readonly relative_path?: unknown; + readonly ref?: unknown; + readonly storage?: unknown; + readonly uri?: unknown; + readonly href?: unknown; + readonly [key: string]: unknown; + }; + +export interface ArtifactPointerMap { + readonly transcript_path?: string; + readonly answer_path?: string; + readonly transcript?: ArtifactPointer; + readonly answer?: ArtifactPointer; + readonly [key: string]: unknown; +} + +export interface ManifestHydrationOptions { + /** + * Defaults to true for report/inspect consumers that need a trace projection. + * Dashboard detail routes set this false so transcript bodies are loaded only + * by the explicit transcript artifact endpoint. 
+ */ + readonly hydrateTranscriptTrace?: boolean; +} + function parseJsonlLines(content: string): T[] { return content .split(/\r?\n/) @@ -114,6 +149,33 @@ function readOptionalJson(baseDir: string, relativePath: string | undefined): } } +function nonEmptyString(value: unknown): string | undefined { + return typeof value === 'string' && value.trim().length > 0 ? value : undefined; +} + +function artifactPointerPath(pointer: ArtifactPointer | undefined): string | undefined { + if (typeof pointer === 'string') { + return nonEmptyString(pointer); + } + if (!pointer) { + return undefined; + } + return ( + nonEmptyString(pointer.path) ?? + nonEmptyString(pointer.artifact_path) ?? + nonEmptyString(pointer.relative_path) + ); +} + +function resolveTranscriptPath(record: ResultManifestRecord): string | undefined { + return ( + record.transcript_path ?? + record.artifact_pointers?.transcript?.path ?? + record.artifacts?.transcript_path ?? + artifactPointerPath(record.transcript ?? record.artifacts?.transcript) + ); +} + function hydrateInput( baseDir: string, record: ResultManifestRecord, @@ -142,13 +204,19 @@ function hydrateOutput( return responseText.trimEnd(); } -function hydrateTrace(baseDir: string, record: ResultManifestRecord): EvaluationResult['trace'] { - const transcriptText = readOptionalText(baseDir, record.transcript_path); - if (transcriptText) { - try { - return traceFromTranscriptJsonLines(parseJsonlLines(transcriptText)); - } catch { - // Fall through to a minimal trace below. +function hydrateTrace( + baseDir: string, + record: ResultManifestRecord, + options: ManifestHydrationOptions, +): EvaluationResult['trace'] { + if (options.hydrateTranscriptTrace !== false) { + const transcriptText = readOptionalText(baseDir, resolveTranscriptPath(record)); + if (transcriptText) { + try { + return traceFromTranscriptJsonLines(parseJsonlLines(transcriptText)); + } catch { + // Fall through to a minimal trace below. 
+ } } } @@ -163,7 +231,11 @@ function hydrateTrace(baseDir: string, record: ResultManifestRecord): Evaluation }); } -function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): EvaluationResult { +function hydrateManifestRecord( + baseDir: string, + record: ResultManifestRecord, + options: ManifestHydrationOptions, +): EvaluationResult { const grading = readOptionalJson(baseDir, record.grading_path); const timing = readOptionalJson(baseDir, record.timing_path); const testId = record.test_id ?? 'unknown'; @@ -218,7 +290,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E costUsd: record.cost_usd, input: hydrateInput(baseDir, record), output: hydrateOutput(baseDir, record) ?? '', - trace: hydrateTrace(baseDir, record), + trace: hydrateTrace(baseDir, record, options), metadata: record.metadata, } as EvaluationResult; } @@ -235,12 +307,15 @@ export function resolveResultSourcePath(source: string, cwd?: string): string { return resolved; } -export function loadManifestResults(sourceFile: string): EvaluationResult[] { +export function loadManifestResults( + sourceFile: string, + options: ManifestHydrationOptions = {}, +): EvaluationResult[] { const resolvedSourceFile = resolveRunManifestPath(sourceFile); const content = readFileSync(resolvedSourceFile, 'utf8'); const records = parseResultRows(content, resolvedSourceFile); const baseDir = path.dirname(resolvedSourceFile); - return records.map((record) => hydrateManifestRecord(baseDir, record)); + return records.map((record) => hydrateManifestRecord(baseDir, record, options)); } export interface LightweightResultRecord { @@ -253,6 +328,7 @@ export interface LightweightResultRecord { readonly scores?: readonly Record[]; readonly executionStatus?: string; readonly error?: string; + readonly costUsd?: number; readonly timestamp?: string; } @@ -269,6 +345,7 @@ export function loadLightweightResults(sourceFile: string): LightweightResultRec scores: record.scores, 
executionStatus: record.execution_status, error: record.error, + costUsd: record.cost_usd, timestamp: record.timestamp, })); } diff --git a/apps/cli/src/commands/results/projection-bundle.ts b/apps/cli/src/commands/results/projection-bundle.ts new file mode 100644 index 000000000..d01cb12c8 --- /dev/null +++ b/apps/cli/src/commands/results/projection-bundle.ts @@ -0,0 +1,405 @@ +/** + * Vendor-neutral projection bundle for completed AgentV runs. + * + * This file builds a deterministic, local JSON contract that adapter workers + * can consume without calling vendor SDKs. The bundle keeps AgentV artifacts as + * the source of truth, includes metadata-only OpenInference-shaped spans by + * default, and requires explicit opt-in before raw prompt/output/tool payloads + * are copied into the bundle. + */ + +import { createHash } from 'node:crypto'; +import { mkdir, writeFile } from 'node:fs/promises'; +import path from 'node:path'; + +import { + type EvaluationResult, + type ExportDuplicatePolicy, + type IndexArtifactEntry, + type ProjectionIdentityWire, + type TraceEnvelopeCaptureWire, + type TraceEnvelopeConversionWarningWire, + type TraceEnvelopeScoreWire, + type TraceEnvelopeWire, + buildResultIndexArtifact, + buildTraceEnvelopeFromEvaluationResult, + toTraceEnvelopeWire, +} from '@agentv/core'; + +export const PROJECTION_BUNDLE_FILENAME = 'projection_bundle.json'; +export const PROJECTION_BUNDLE_SCHEMA_VERSION = 'agentv.projection_bundle.v1'; + +type JsonRecord = Record; + +export interface ProjectionBundle { + readonly schema_version: typeof PROJECTION_BUNDLE_SCHEMA_VERSION; + readonly bundle_id: string; + readonly created_at: string; + readonly source: { + readonly kind: 'agentv_run'; + readonly path: string; + readonly run_id: string; + readonly result_count: number; + }; + readonly content_policy: { + readonly raw_content: 'excluded' | 'included'; + readonly raw_content_opt_in: boolean; + readonly default_capture: 'metadata' | 'full'; + readonly 
backend_anonymizer_boundary: 'adapter'; + }; + readonly capture_summary: TraceEnvelopeCaptureWire; + readonly entries: readonly ProjectionBundleEntry[]; + readonly conversion_warnings?: readonly TraceEnvelopeConversionWarningWire[]; +} + +export interface ProjectionBundleEntry { + readonly projection_id: string; + readonly projection_identity: ProjectionIdentityWire; + readonly eval: TraceEnvelopeWire['eval']; + readonly artifact_refs: ProjectionBundleArtifactRefs; + readonly trace: { + readonly format: TraceEnvelopeWire['trace']['format']; + readonly trace_id: string; + readonly root_span_id: string; + readonly span_count: number; + readonly envelope_ref?: string; + }; + readonly trace_envelope: TraceEnvelopeWire; + readonly feedback: { + readonly source: 'agentv_grading_artifacts'; + readonly result_score: number; + readonly execution_status?: string; + readonly grading_path?: string; + readonly timing_path?: string; + readonly assertion_count: number; + readonly scores?: readonly TraceEnvelopeScoreWire[]; + }; + readonly capture: TraceEnvelopeCaptureWire; + readonly conversion_warnings?: readonly TraceEnvelopeConversionWarningWire[]; + readonly raw_content?: { + readonly input?: unknown; + readonly output?: string; + readonly trace_messages?: unknown; + }; +} + +export type ProjectionBundleArtifactRefs = Partial< + Pick< + IndexArtifactEntry, + | 'artifact_dir' + | 'grading_path' + | 'timing_path' + | 'input_path' + | 'output_path' + | 'answer_path' + | 'response_path' + | 'transcript_path' + | 'task_dir' + | 'eval_path' + | 'targets_path' + | 'files_path' + | 'graders_path' + > & { readonly trace_path: string } +> & { + readonly status: 'planned_export' | 'emitted'; +}; + +export interface BuildProjectionBundleOptions { + readonly sourceFile: string; + readonly runId: string; + readonly cwd?: string; + readonly includeRawContent?: boolean; + readonly duplicatePolicy?: ExportDuplicatePolicy; + readonly artifactRefStatus?: ProjectionBundleArtifactRefs['status']; 
+ readonly indexRecords?: readonly IndexArtifactEntry[]; +} + +function dropUndefined(value: T): T { + return Object.fromEntries( + Object.entries(value).filter(([, entryValue]) => entryValue !== undefined), + ) as T; +} + +function toPortablePath(filePath: string, cwd?: string): string { + const absolutePath = path.resolve(filePath); + const absoluteCwd = path.resolve(cwd ?? process.cwd()); + const relative = path.relative(absoluteCwd, absolutePath); + const portable = + relative && !relative.startsWith('..') && !path.isAbsolute(relative) ? relative : absolutePath; + return portable.split(path.sep).join('/'); +} + +function stableDate(value: string | undefined): Date { + const parsed = value ? Date.parse(value) : Number.NaN; + return Number.isFinite(parsed) ? new Date(parsed) : new Date(0); +} + +function bundleCreatedAt(results: readonly EvaluationResult[]): string { + const timestamps = results + .map((result) => stableDate(result.timestamp).toISOString()) + .sort((a, b) => a.localeCompare(b)); + return timestamps[0] ?? new Date(0).toISOString(); +} + +function shortHash(parts: readonly string[], length = 20): string { + return createHash('sha256').update(parts.join('\n')).digest('hex').slice(0, length); +} + +function tracePathFor(indexEntry: IndexArtifactEntry): string | undefined { + return indexEntry.artifact_dir + ? 
path.posix.join(indexEntry.artifact_dir, 'outputs', 'trace.json') + : undefined; +} + +function artifactRefs( + indexEntry: IndexArtifactEntry, + options: { + readonly includeRawContent: boolean; + readonly status: ProjectionBundleArtifactRefs['status']; + }, +): ProjectionBundleArtifactRefs { + const metadataRefs = dropUndefined({ + status: options.status, + timing_path: indexEntry.timing_path, + }); + + if (!options.includeRawContent) { + return metadataRefs; + } + + return dropUndefined({ + ...metadataRefs, + artifact_dir: indexEntry.artifact_dir, + grading_path: indexEntry.grading_path, + input_path: indexEntry.input_path, + output_path: indexEntry.output_path, + answer_path: indexEntry.answer_path, + response_path: indexEntry.response_path, + transcript_path: indexEntry.transcript_path, + trace_path: tracePathFor(indexEntry), + task_dir: indexEntry.task_dir, + eval_path: indexEntry.eval_path, + targets_path: indexEntry.targets_path, + files_path: indexEntry.files_path, + graders_path: indexEntry.graders_path, + }); +} + +function removeTranscriptMessageMetadata(envelope: TraceEnvelopeWire): TraceEnvelopeWire { + return { + ...envelope, + trace: { + ...envelope.trace, + spans: envelope.trace.spans.map((span) => ({ + ...span, + events: span.events?.map((event) => { + const transcriptMessage = event.attributes?.['agentv.transcript.message']; + if ( + !transcriptMessage || + typeof transcriptMessage !== 'object' || + Array.isArray(transcriptMessage) + ) { + return event; + } + const { metadata: _metadata, ...safeMessage } = transcriptMessage as JsonRecord; + return { + ...event, + attributes: { + ...event.attributes, + 'agentv.transcript.message': safeMessage, + }, + }; + }), + })), + }, + }; +} + +function safeEnvelope( + envelope: TraceEnvelopeWire, + options: { includeRawContent: boolean }, +): TraceEnvelopeWire { + if (options.includeRawContent) { + return envelope; + } + + const withoutRawEvidence = removeTranscriptMessageMetadata({ + ...envelope, + source: { 
+ ...envelope.source, + metadata: undefined, + }, + artifacts: undefined, + scores: envelope.scores?.map(({ evidence: _evidence, ...score }) => score), + }); + + return JSON.parse(JSON.stringify(withoutRawEvidence)) as TraceEnvelopeWire; +} + +function safeScores( + scores: readonly TraceEnvelopeScoreWire[] | undefined, + options: { includeRawContent: boolean }, +): readonly TraceEnvelopeScoreWire[] | undefined { + if (!scores) { + return undefined; + } + return options.includeRawContent + ? scores + : scores.map(({ evidence: _evidence, ...score }) => score); +} + +function captureOptions(includeRawContent: boolean) { + return includeRawContent + ? { content: 'full' as const, redactionLevel: 'none' as const, redactedFields: [] } + : undefined; +} + +function rawContent(result: EvaluationResult): ProjectionBundleEntry['raw_content'] { + return dropUndefined({ + input: result.input, + output: result.output, + trace_messages: result.trace.messages, + }); +} + +function buildEntry( + result: EvaluationResult, + options: BuildProjectionBundleOptions, + indexRecord?: IndexArtifactEntry, +): ProjectionBundleEntry { + const includeRawContent = options.includeRawContent ?? false; + const sourcePath = toPortablePath(options.sourceFile, options.cwd); + const plannedIndexEntry = buildResultIndexArtifact(result); + const envelope = buildTraceEnvelopeFromEvaluationResult(result, { + evalPath: sourcePath, + runId: options.runId, + source: { kind: 'agentv_run', path: sourcePath, format: 'agentv_result' }, + artifacts: { + trace_path: tracePathFor(indexRecord ?? plannedIndexEntry), + answer_path: result.output.length > 0 ? 'outputs/answer.md' : undefined, + response_path: result.output.length > 0 ? 
'outputs/response.md' : undefined, + }, + duplicatePolicy: options.duplicatePolicy, + capture: captureOptions(includeRawContent), + now: () => stableDate(result.timestamp), + }); + const projectionIdentity = envelope.projectionIdentity; + if (!projectionIdentity) { + throw new Error(`Result ${result.testId ?? 'unknown'} is missing projection identity`); + } + + const indexEntry = + indexRecord ?? + buildResultIndexArtifact(result, undefined, { + projectionIdentity, + duplicatePolicy: options.duplicatePolicy, + }); + const refs = artifactRefs(indexEntry, { + includeRawContent, + status: options.artifactRefStatus ?? 'planned_export', + }); + const safeEnvelopeWire = safeEnvelope(toTraceEnvelopeWire(envelope), { includeRawContent }); + const projectionIdentityWire = + indexEntry.projection_identity ?? safeEnvelopeWire.projection_identity; + if (!projectionIdentityWire) { + throw new Error(`Result ${result.testId ?? 'unknown'} is missing projection identity`); + } + const envelopeWire = { + ...safeEnvelopeWire, + projection_identity: projectionIdentityWire, + }; + const scores = safeScores(envelopeWire.scores, { includeRawContent }); + + const feedback: ProjectionBundleEntry['feedback'] = dropUndefined({ + source: 'agentv_grading_artifacts', + result_score: result.score, + execution_status: result.executionStatus, + grading_path: refs.grading_path, + timing_path: refs.timing_path, + assertion_count: result.assertions?.length ?? 0, + scores, + }); + + return { + projection_id: projectionIdentity.id, + projection_identity: projectionIdentityWire, + eval: envelopeWire.eval, + artifact_refs: refs, + trace: dropUndefined({ + format: envelopeWire.trace.format, + trace_id: envelopeWire.trace.trace_id, + root_span_id: envelopeWire.trace.root_span_id, + span_count: envelopeWire.trace.spans.length, + envelope_ref: refs.trace_path, + }), + trace_envelope: envelopeWire, + feedback, + capture: envelopeWire.capture, + ...(envelopeWire.conversion_warnings + ? 
{ conversion_warnings: envelopeWire.conversion_warnings } + : {}), + ...(includeRawContent ? { raw_content: rawContent(result) } : {}), + }; +} + +export function buildProjectionBundle( + results: readonly EvaluationResult[], + options: BuildProjectionBundleOptions, +): ProjectionBundle { + if (results.length === 0) { + throw new Error(`No results found in ${options.sourceFile}`); + } + + const entries = results.map((result, index) => + buildEntry(result, options, options.indexRecords?.[index]), + ); + const includeRawContent = options.includeRawContent ?? false; + const artifactRefStatus = options.artifactRefStatus ?? 'planned_export'; + const conversionWarnings = entries.flatMap((entry) => entry.conversion_warnings ?? []); + const bundleId = `projection-bundle-${shortHash([ + PROJECTION_BUNDLE_SCHEMA_VERSION, + toPortablePath(options.sourceFile, options.cwd), + options.runId, + artifactRefStatus, + includeRawContent ? 'raw' : 'metadata', + ...entries.map((entry) => entry.projection_id), + ])}`; + + return { + schema_version: PROJECTION_BUNDLE_SCHEMA_VERSION, + bundle_id: bundleId, + created_at: bundleCreatedAt(results), + source: { + kind: 'agentv_run', + path: toPortablePath(options.sourceFile, options.cwd), + run_id: options.runId, + result_count: results.length, + }, + content_policy: { + raw_content: includeRawContent ? 'included' : 'excluded', + raw_content_opt_in: includeRawContent, + default_capture: includeRawContent ? 'full' : 'metadata', + backend_anonymizer_boundary: 'adapter', + }, + capture_summary: entries[0]?.capture ?? { + content: includeRawContent ? 'full' : 'metadata', + redaction_level: includeRawContent ? 'none' : 'partial', + }, + entries, + ...(conversionWarnings.length > 0 ? 
{ conversion_warnings: conversionWarnings } : {}), + }; +} + +export function serializeProjectionBundle(bundle: ProjectionBundle): string { + return `${JSON.stringify(bundle, null, 2)}\n`; +} + +export async function writeProjectionBundle( + bundle: ProjectionBundle, + outputDir: string, +): Promise { + const bundlePath = path.join(outputDir, PROJECTION_BUNDLE_FILENAME); + await mkdir(outputDir, { recursive: true }); + await writeFile(bundlePath, serializeProjectionBundle(bundle), 'utf8'); + return bundlePath; +} diff --git a/apps/cli/src/commands/results/remote-metadata.ts b/apps/cli/src/commands/results/remote-metadata.ts index 00f57de7b..21513efae 100644 --- a/apps/cli/src/commands/results/remote-metadata.ts +++ b/apps/cli/src/commands/results/remote-metadata.ts @@ -17,6 +17,13 @@ import { execFileSync } from 'node:child_process'; import { existsSync, mkdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; import path from 'node:path'; +import { + type RunOplogWatermark, + buildRunIdFromRelativePath, + createRunTagsSetOperation, + normalizeRunOplogWatermark, + watermarkFromRunOperation, +} from './run-oplog.js'; import { RUN_TAGS_FILENAME, normalizeTags } from './run-tags.js'; const RESULTS_RUNS_DIR = 'runs'; @@ -25,6 +32,7 @@ const REMOTE_METADATA_RUNS_DIR = path.join('metadata', 'runs'); interface TagsFile { readonly tags: string[]; readonly updatedAt?: string; + readonly oplogWatermark?: RunOplogWatermark; } interface RemoteRunMetadataPaths { @@ -48,6 +56,7 @@ export interface RemoteRunTagState { readonly pendingTags?: string[]; readonly dirty: boolean; readonly updatedAt?: string; + readonly oplogWatermark: RunOplogWatermark; readonly metadataPath: string; } @@ -112,9 +121,11 @@ function parseTagsFile(content: string): TagsFile | undefined { const record = parsed as Record; if (!Array.isArray(record.tags)) return undefined; const tags = record.tags.filter((tag): tag is string => typeof tag === 'string'); + const updatedAt = typeof record.updated_at 
=== 'string' ? record.updated_at : undefined; return { tags, - updatedAt: typeof record.updated_at === 'string' ? record.updated_at : undefined, + updatedAt, + oplogWatermark: normalizeRunOplogWatermark(record.oplog_watermark, updatedAt), }; } @@ -123,6 +134,26 @@ function equalTags(a: readonly string[], b: readonly string[]): boolean { return a.every((tag, index) => tag === b[index]); } +function equalWatermarks( + a: RunOplogWatermark | undefined, + b: RunOplogWatermark | undefined, +): boolean { + return ( + a?.ref === b?.ref && a?.operation_id === b?.operation_id && a?.updated_at === b?.updated_at + ); +} + +function equalTagFiles(a: TagsFile | undefined, b: TagsFile | undefined): boolean { + if (a === undefined || b === undefined) { + return a === b; + } + return ( + equalTags(a.tags, b.tags) && + a.updatedAt === b.updatedAt && + equalWatermarks(a.oplogWatermark, b.oplogWatermark) + ); +} + function resolveComparisonRef(repoDir: string): string | undefined { const upstream = tryRunGit(repoDir, [ 'rev-parse', @@ -188,7 +219,14 @@ function readRemoteRunTagsContext(repoDir: string, manifestPath: string): Remote function toRemoteRunTagState(context: RemoteRunTagsContext): RemoteRunTagState { const remoteTags = context.baseOverlayTags?.tags ?? context.artifactTags?.tags ?? []; const effectiveTags = context.localOverlayTags?.tags ?? remoteTags; - const dirty = !equalTags(effectiveTags, remoteTags); + const dirty = context.localOverlayTags + ? !equalTagFiles(context.localOverlayTags, context.baseOverlayTags) + : !equalTags(effectiveTags, remoteTags); + const watermark = + context.localOverlayTags?.oplogWatermark ?? + context.baseOverlayTags?.oplogWatermark ?? + context.artifactTags?.oplogWatermark ?? + normalizeRunOplogWatermark(undefined); return { tags: effectiveTags, @@ -199,6 +237,7 @@ function toRemoteRunTagState(context: RemoteRunTagsContext): RemoteRunTagState { context.localOverlayTags?.updatedAt ?? context.baseOverlayTags?.updatedAt ?? 
context.artifactTags?.updatedAt, + oplogWatermark: watermark, metadataPath: context.paths.overlayTagsPath, }; } @@ -235,14 +274,25 @@ export function writeRemoteRunTags( const context = readRemoteRunTagsContext(repoDir, manifestPath); const remoteTags = context.baseOverlayTags?.tags ?? context.artifactTags?.tags ?? []; - if (equalTags(cleaned, remoteTags) && context.baseOverlayTags === undefined) { + if ( + cleaned.length > 0 && + equalTags(cleaned, remoteTags) && + context.baseOverlayTags === undefined + ) { rmSync(context.paths.overlayTagsPath, { force: true }); return readRemoteRunTags(repoDir, manifestPath); } + const operation = createRunTagsSetOperation({ + runId: buildRunIdFromRelativePath(context.paths.runRelativePath), + runPath: context.paths.runRelativePath, + tags: cleaned, + actor: { kind: 'dashboard' }, + }); const entry = { tags: cleaned, - updated_at: new Date().toISOString(), + updated_at: operation.authored_at, + oplog_watermark: watermarkFromRunOperation(operation), }; mkdirSync(path.dirname(context.paths.overlayTagsPath), { recursive: true }); writeFileSync(context.paths.overlayTagsPath, `${JSON.stringify(entry, null, 2)}\n`, 'utf8'); diff --git a/apps/cli/src/commands/results/run-oplog.ts b/apps/cli/src/commands/results/run-oplog.ts new file mode 100644 index 000000000..c307e5841 --- /dev/null +++ b/apps/cli/src/commands/results/run-oplog.ts @@ -0,0 +1,141 @@ +import { randomUUID } from 'node:crypto'; + +/** + * Minimal run operation-log contract used by Dashboard read models. + * + * The raw oplog storage branch is intentionally not implemented here. This + * module only centralizes the ref name, a small typed operation envelope for + * tag replacement, and the materialized final-state shape that readers consume. 
+ */ + +export const RUN_OPLOG_REF = 'agentv/oplog/v1'; +export const RUN_OPERATION_SCHEMA_VERSION = 'agentv.run_operation.v1'; + +export type RunFinalStateLifecycle = 'active' | 'hidden' | 'deleted'; + +export interface RunOplogWatermark { + readonly ref: typeof RUN_OPLOG_REF; + readonly operation_id?: string; + readonly updated_at?: string; +} + +export interface RunFinalState { + readonly lifecycle: RunFinalStateLifecycle; + readonly tags: string[]; +} + +export interface RunReadStateFields { + readonly final_state: RunFinalState; + readonly oplog_watermark: RunOplogWatermark; +} + +export type RunOperationActorKind = 'dashboard' | 'cli' | 'ci' | 'agent' | 'unknown'; + +export interface RunOperationActor { + readonly kind: RunOperationActorKind; + readonly id?: string; +} + +export interface RunOperationSubject { + readonly run_id: string; + readonly run_path?: string; +} + +export interface RunTagsSetOperation { + readonly schema_version: typeof RUN_OPERATION_SCHEMA_VERSION; + readonly operation_id: string; + readonly operation_type: 'run.tags.set'; + readonly authored_at: string; + readonly actor: RunOperationActor; + readonly subject: RunOperationSubject; + readonly payload: { + readonly tags: string[]; + }; +} + +export type RunOperationEnvelope = RunTagsSetOperation; + +export function buildRunIdFromRelativePath(relativeRunPath: string): string { + const segments = relativeRunPath.split(/[\\/]+/).filter(Boolean); + if (segments.length >= 2) { + const experiment = segments.slice(0, -1).join('/'); + const runName = segments.at(-1) ?? relativeRunPath; + return experiment === 'default' ? runName : `${experiment}::${runName}`; + } + return segments[0] ?? 
relativeRunPath; +} + +export function createRunTagsSetOperation(input: { + readonly runId: string; + readonly runPath?: string; + readonly tags: readonly string[]; + readonly actor?: RunOperationActor; + readonly authoredAt?: string; + readonly operationId?: string; +}): RunTagsSetOperation { + return { + schema_version: RUN_OPERATION_SCHEMA_VERSION, + operation_id: input.operationId ?? randomUUID(), + operation_type: 'run.tags.set', + authored_at: input.authoredAt ?? new Date().toISOString(), + actor: input.actor ?? { kind: 'unknown' }, + subject: { + run_id: input.runId, + ...(input.runPath ? { run_path: input.runPath } : {}), + }, + payload: { + tags: [...input.tags], + }, + }; +} + +export function watermarkFromRunOperation(operation: RunOperationEnvelope): RunOplogWatermark { + return { + ref: RUN_OPLOG_REF, + operation_id: operation.operation_id, + updated_at: operation.authored_at, + }; +} + +export function normalizeRunOplogWatermark( + input: unknown, + fallbackUpdatedAt?: string, +): RunOplogWatermark { + if (input && typeof input === 'object') { + const record = input as Record; + const operationId = record.operation_id; + const updatedAt = record.updated_at; + return { + ref: RUN_OPLOG_REF, + ...(typeof operationId === 'string' && operationId ? { operation_id: operationId } : {}), + ...(typeof updatedAt === 'string' && updatedAt + ? { updated_at: updatedAt } + : fallbackUpdatedAt + ? { updated_at: fallbackUpdatedAt } + : {}), + }; + } + + return { + ref: RUN_OPLOG_REF, + ...(fallbackUpdatedAt ? { updated_at: fallbackUpdatedAt } : {}), + }; +} + +export function materializeRunState(input?: { + readonly lifecycle?: RunFinalStateLifecycle; + readonly tags?: readonly string[]; + readonly watermark?: RunOplogWatermark; + readonly updatedAt?: string; +}): RunReadStateFields { + const tags = [...(input?.tags ?? [])]; + const watermark = input?.watermark ?? 
normalizeRunOplogWatermark(undefined, input?.updatedAt); + + return { + final_state: { + lifecycle: input?.lifecycle ?? 'active', + tags, + }, + oplog_watermark: watermark, + }; +} diff --git a/apps/cli/src/commands/results/run-tags.ts b/apps/cli/src/commands/results/run-tags.ts index 3714f73d7..9464aa068 100644 --- a/apps/cli/src/commands/results/run-tags.ts +++ b/apps/cli/src/commands/results/run-tags.ts @@ -7,20 +7,24 @@ * * Wire format (stored on disk): * ```json - * { "tags": ["baseline", "v2-prompt"], "updated_at": "2026-04-10T00:00:00.000Z" } + * { + * "tags": ["baseline", "v2-prompt"], + * "updated_at": "2026-04-10T00:00:00.000Z", + * "oplog_watermark": { "ref": "agentv/oplog/v1" } + * } * ``` * * Used by the Dashboard compare API so users can retroactively tag runs - * without changing the eval YAML or the run manifest itself. This mirrors - * the Langfuse / W&B / GitHub `tags` pattern — a mutable multi-valued - * list of free-form labels that lives alongside the immutable run_id. + * without changing the eval YAML or the run manifest itself. Tags are a + * mutable multi-valued list of free-form labels that lives alongside the + * immutable run_id. * * Validation rules: * - Each tag is 1–60 characters after trimming * - No control characters (\n, \t, DEL, etc.) * - Tags are deduplicated case-sensitively * - A run can have at most 20 tags - * - Writing an empty array removes the sidecar file + * - Writing an empty array records a clear/tombstone state with a watermark * * To extend (e.g. add colored labels or descriptions): add optional fields * to `RunTagsFile` and keep the schema additive so older files still parse. 
@@ -29,6 +33,14 @@ import { existsSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs'; import path from 'node:path'; +import { + type RunOplogWatermark, + buildRunIdFromRelativePath, + createRunTagsSetOperation, + normalizeRunOplogWatermark, + watermarkFromRunOperation, +} from './run-oplog.js'; + export const RUN_TAGS_FILENAME = 'tags.json'; /** Maximum number of tags per run. */ @@ -42,6 +54,8 @@ export interface RunTagsFile { tags: string[]; /** ISO-8601 timestamp of last update. */ updated_at: string; + /** Watermark for the operation-log state this materialized tag list reflects. */ + oplog_watermark?: RunOplogWatermark; } /** Resolve the tags sidecar path given a run manifest (index.jsonl) path. */ @@ -49,6 +63,16 @@ export function runTagsPath(manifestPath: string): string { return path.join(path.dirname(manifestPath), RUN_TAGS_FILENAME); } +function inferRunRelativePath(manifestPath: string): string { + const runDir = path.dirname(manifestPath); + const segments = runDir.split(path.sep); + const runsIndex = segments.lastIndexOf('runs'); + if (runsIndex >= 0 && runsIndex < segments.length - 1) { + return segments.slice(runsIndex + 1).join('/'); + } + return path.basename(runDir); +} + /** Read the tags for a run. Returns `undefined` if missing or unreadable. */ export function readRunTags(manifestPath: string): RunTagsFile | undefined { const fp = runTagsPath(manifestPath); @@ -61,10 +85,11 @@ export function readRunTags(manifestPath: string): RunTagsFile | undefined { const tags = record.tags.filter( (t): t is string => typeof t === 'string' && t.trim().length > 0, ); - if (tags.length === 0) return undefined; + const updatedAt = typeof record.updated_at === 'string' ? record.updated_at : ''; return { tags, - updated_at: typeof record.updated_at === 'string' ? 
record.updated_at : '', + updated_at: updatedAt, + oplog_watermark: normalizeRunOplogWatermark(record.oplog_watermark, updatedAt || undefined), }; } catch { return undefined; @@ -72,18 +97,22 @@ export function readRunTags(manifestPath: string): RunTagsFile | undefined { } /** - * Write tags for a run. Replaces any existing tags. Pass an empty array - * to remove the sidecar entirely. + * Write tags for a run. Replaces any existing tags. Pass an empty array to + * record that tags were intentionally cleared while preserving the watermark. */ -export function writeRunTags(manifestPath: string, tags: readonly string[]): RunTagsFile | null { +export function writeRunTags(manifestPath: string, tags: readonly string[]): RunTagsFile { const cleaned = normalizeTags(tags); - if (cleaned.length === 0) { - deleteRunTags(manifestPath); - return null; - } + const runPath = inferRunRelativePath(manifestPath); + const operation = createRunTagsSetOperation({ + runId: buildRunIdFromRelativePath(runPath), + runPath, + tags: cleaned, + actor: { kind: 'dashboard' }, + }); const entry: RunTagsFile = { tags: cleaned, - updated_at: new Date().toISOString(), + updated_at: operation.authored_at, + oplog_watermark: watermarkFromRunOperation(operation), }; writeFileSync(runTagsPath(manifestPath), `${JSON.stringify(entry, null, 2)}\n`, 'utf8'); return entry; diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 839fc68a5..a846f0beb 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -34,7 +34,15 @@ * - createApp(results, cwd) — Hono app factory */ -import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync } from 'node:fs'; +import { + existsSync, + mkdirSync, + readFileSync, + readdirSync, + realpathSync, + statSync, + writeFileSync, +} from 'node:fs'; import { homedir } from 'node:os'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; @@ -67,7 +75,12 @@ 
import { } from './combine-run.js'; import { deleteLocalRun } from './delete-run.js'; import { getActiveRunStatus, getActiveRunTarget, registerEvalRoutes } from './eval-runner.js'; -import { loadLightweightResults, loadManifestResults, parseResultManifest } from './manifest.js'; +import { + type ResultManifestRecord, + loadLightweightResults, + loadManifestResults, + parseResultManifest, +} from './manifest.js'; import { type SourcedResultFileMeta, clearRemoteRunTags, @@ -79,7 +92,13 @@ import { setRemoteRunTags, syncRemoteResults, } from './remote.js'; -import { deleteRunTags, readRunTags, writeRunTags } from './run-tags.js'; +import { + type RunFinalState, + type RunOplogWatermark, + type RunReadStateFields, + materializeRunState, +} from './run-oplog.js'; +import { readRunTags, writeRunTags } from './run-tags.js'; import { type StudioConfig, loadStudioConfig, saveStudioConfig } from './studio-config.js'; // ── Source resolution ──────────────────────────────────────────────────── @@ -285,6 +304,152 @@ function contentDispositionFilename(filePath: string): string { return path.basename(filePath).replace(/["\\\r\n]/g, '_'); } +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function nonEmptyString(value: unknown): string | undefined { + return typeof value === 'string' && value.trim() ? value.trim() : undefined; +} + +function artifactPointerPath(pointer: unknown): string | undefined { + if (typeof pointer === 'string') return nonEmptyString(pointer); + if (!isRecord(pointer)) return undefined; + return ( + nonEmptyString(pointer.path) ?? + nonEmptyString(pointer.artifact_path) ?? 
+ nonEmptyString(pointer.relative_path) + ); +} + +function artifactPointerDescription(pointer: unknown): string | undefined { + if (typeof pointer === 'string') return pointer; + if (!isRecord(pointer)) return undefined; + const ref = nonEmptyString(pointer.ref); + const storage = nonEmptyString(pointer.storage); + const uri = nonEmptyString(pointer.uri) ?? nonEmptyString(pointer.href); + const pointerPath = artifactPointerPath(pointer); + const parts = [ + ref ? `ref ${ref}` : undefined, + storage ? `storage ${storage}` : undefined, + uri ? `uri ${uri}` : undefined, + pointerPath ? `path ${pointerPath}` : undefined, + ].filter((part): part is string => part !== undefined); + return parts.length > 0 ? parts.join(', ') : undefined; +} + +function artifactPointerRef(pointer: unknown): string | undefined { + return isRecord(pointer) ? nonEmptyString(pointer.ref) : undefined; +} + +interface ResolvedArtifactPointer { + readonly path?: string; + readonly description?: string; + readonly ref?: string; + readonly unsupportedReason?: string; +} + +function resolveRecordArtifactPointer( + record: ResultManifestRecord, + kind: 'transcript' | 'answer', +): ResolvedArtifactPointer { + const directPath = + kind === 'transcript' + ? (record.transcript_path ?? record.artifacts?.transcript_path) + : (record.answer_path ?? record.artifacts?.answer_path ?? record.output_path); + if (directPath) { + return { path: directPath, description: directPath }; + } + + const pointer = + kind === 'transcript' + ? (record.transcript ?? record.artifacts?.transcript ?? record.artifact_pointers?.transcript) + : record.artifacts?.answer; + const pointerPath = artifactPointerPath(pointer); + const description = artifactPointerDescription(pointer); + const ref = artifactPointerRef(pointer); + if (pointerPath) { + return { path: pointerPath, description, ref }; + } + if (pointer) { + return { + description, + ref, + unsupportedReason: description + ? 
`${kind} artifact pointer does not include a local path (${description}).` + : `${kind} artifact pointer does not include a local path.`, + }; + } + return {}; +} + +function resolveRunArtifactPath( + baseDir: string, + relativePath: string, +): { absolutePath?: string; error?: string } { + const absolutePath = path.resolve(baseDir, relativePath); + const resolvedBase = path.resolve(baseDir); + if (!isPathInsideDirectory(resolvedBase, absolutePath)) { + return { error: 'Artifact path is outside the run workspace.' }; + } + return { absolutePath }; +} + +function isPathInsideDirectory(baseDir: string, candidatePath: string): boolean { + const relative = path.relative(baseDir, candidatePath); + return ( + relative === '' || (!!relative && !relative.startsWith('..') && !path.isAbsolute(relative)) + ); +} + +function resolveReadableRunArtifactFile( + baseDir: string, + relativePath: string, +): { absolutePath?: string; error?: string } { + const resolved = resolveRunArtifactPath(baseDir, relativePath); + if (!resolved.absolutePath) return { error: resolved.error }; + + let realBase: string; + let realArtifact: string; + try { + realBase = realpathSync(baseDir); + realArtifact = realpathSync(resolved.absolutePath); + } catch { + return {}; + } + + if (!isPathInsideDirectory(realBase, realArtifact)) { + return { error: 'Artifact path is outside the run workspace.' 
}; + } + + try { + if (!statSync(realArtifact).isFile()) { + return {}; + } + } catch { + return {}; + } + + return { absolutePath: realArtifact }; +} + +function readOptionalRunArtifactText( + baseDir: string, + artifact: ResolvedArtifactPointer, +): string | undefined { + if (!artifact.path) return undefined; + const resolved = resolveReadableRunArtifactFile(baseDir, artifact.path); + if (!resolved.absolutePath) return undefined; + return readFileSync(resolved.absolutePath, 'utf8'); +} + +function missingTranscriptMessage(): string { + return [ + 'This result does not include canonical outputs/transcript.jsonl metadata.', + 'Dashboard does not parse response.md or markdown transcripts for this view.', + ].join(' '); +} + function stripHeavyFields(results: readonly EvaluationResult[]) { return results.map((r) => { const { requests, trace, ...rest } = r as EvaluationResult & Record; @@ -316,6 +481,8 @@ interface RunTagFields { readonly remote_tags?: string[]; readonly pending_tags?: string[]; readonly metadata_dirty?: boolean; + readonly final_state: RunFinalState; + readonly oplog_watermark: RunOplogWatermark; } // biome-ignore lint/suspicious/noExplicitAny: Hono Context generic varies by route @@ -340,7 +507,15 @@ async function readRunTagFields( ): Promise { if (meta.source === 'local') { const tagsEntry = readRunTags(meta.path); - return tagsEntry ? { tags: tagsEntry.tags } : {}; + const runState = materializeRunState({ + tags: tagsEntry?.tags ?? [], + watermark: tagsEntry?.oplog_watermark, + updatedAt: tagsEntry?.updated_at || undefined, + }); + return { + ...(tagsEntry ? 
{ tags: tagsEntry.tags } : {}), + ...runState, + }; } const state = await readRemoteRunTagState(searchDir, meta, projectId); @@ -349,6 +524,7 @@ async function readRunTagFields( tags: [], remote_tags: [], metadata_dirty: false, + ...materializeRunState({ tags: [] }), }; } @@ -357,6 +533,11 @@ async function readRunTagFields( remote_tags: state.remoteTags, metadata_dirty: state.dirty, ...(state.dirty && { pending_tags: state.pendingTags ?? state.tags }), + ...materializeRunState({ + tags: state.tags, + watermark: state.oplogWatermark, + updatedAt: state.updatedAt, + }), }; } @@ -366,16 +547,34 @@ function remoteTagMutationResponse(state: { readonly pendingTags?: string[]; readonly dirty: boolean; readonly updatedAt?: string; + readonly oplogWatermark: RunOplogWatermark; }) { return { tags: state.tags, remote_tags: state.remoteTags, metadata_dirty: state.dirty, ...(state.dirty && { pending_tags: state.pendingTags ?? state.tags }), + ...materializeRunState({ + tags: state.tags, + watermark: state.oplogWatermark, + updatedAt: state.updatedAt, + }), updated_at: state.updatedAt ?? new Date().toISOString(), }; } +function localTagMutationResponse(input: { + readonly tags: readonly string[]; + readonly updatedAt?: string; + readonly watermark?: RunOplogWatermark; +}): RunReadStateFields { + return materializeRunState({ + tags: input.tags, + watermark: input.watermark, + updatedAt: input.updatedAt, + }); +} + function remoteMetadataErrorStatus(error: unknown): 400 | 409 { const message = error instanceof Error ? 
error.message : String(error); if ( @@ -402,7 +601,7 @@ async function loadManifestResultsForMeta( projectId?: string, ): Promise { await ensureRunReadable(searchDir, meta, projectId); - return loadManifestResults(meta.path); + return loadManifestResults(meta.path, { hydrateTranscriptTrace: false }); } async function loadLightweightResultsForMeta( @@ -824,6 +1023,8 @@ async function handleEvalFiles(c: C, { searchDir, projectId }: DataContext) { if (!record) return c.json({ error: 'Eval not found' }, 404); const baseDir = path.dirname(meta.path); + const transcriptArtifact = resolveRecordArtifactPointer(record, 'transcript'); + const answerArtifact = resolveRecordArtifactPointer(record, 'answer'); const knownPaths = [ record.grading_path, record.timing_path, @@ -832,12 +1033,14 @@ async function handleEvalFiles(c: C, { searchDir, projectId }: DataContext) { record.response_path, record.answer_path, record.transcript_path, + transcriptArtifact.path, + answerArtifact.path, record.task_dir, record.eval_path, record.targets_path, record.files_path, record.graders_path, - ].filter((p): p is string => !!p); + ].filter((p, index, all): p is string => !!p && all.indexOf(p) === index); if (knownPaths.length === 0) return c.json({ files: [] }); @@ -877,39 +1080,93 @@ async function handleEvalFileContent(c: C, { searchDir, projectId }: DataContext await ensureRunReadable(searchDir, meta, projectId); const baseDir = path.dirname(meta.path); - const absolutePath = path.resolve(baseDir, filePath); - - // Security: prevent path traversal — resolved path must be inside baseDir - if ( - !absolutePath.startsWith(path.resolve(baseDir) + path.sep) && - absolutePath !== path.resolve(baseDir) - ) { + const resolvedFile = resolveReadableRunArtifactFile(baseDir, filePath); + if (resolvedFile.error) { return c.json({ error: 'Path traversal not allowed' }, 403); } - - if (!existsSync(absolutePath) || !statSync(absolutePath).isFile()) { + if (!resolvedFile.absolutePath) { return c.json({ 
error: 'File not found' }, 404); } try { - const fileContent = readFileSync(absolutePath, 'utf8'); + const fileContent = readFileSync(resolvedFile.absolutePath, 'utf8'); if (c.req.query('raw') === '1' || c.req.query('download') === '1') { - c.header('Content-Type', inferRawContentType(absolutePath)); + c.header('Content-Type', inferRawContentType(filePath)); if (c.req.query('download') === '1') { c.header( 'Content-Disposition', - `attachment; filename="${contentDispositionFilename(absolutePath)}"`, + `attachment; filename="${contentDispositionFilename(filePath)}"`, ); } return c.body(fileContent); } - const language = inferLanguage(absolutePath); + const language = inferLanguage(filePath); return c.json({ content: fileContent, language }); } catch { return c.json({ error: 'Failed to read file' }, 500); } } +async function handleEvalTranscript(c: C, { searchDir, projectId }: DataContext) { + const filename = c.req.param('filename') ?? ''; + const evalId = c.req.param('evalId'); + const meta = await findRunById(searchDir, filename, projectId); + if (!meta) return c.json({ error: 'Run not found' }, 404); + + try { + const records = await parseManifestForMeta(searchDir, meta, projectId); + const record = records.find((r) => r.test_id === evalId); + if (!record) return c.json({ error: 'Eval not found' }, 404); + + const baseDir = path.dirname(meta.path); + const transcript = resolveRecordArtifactPointer(record, 'transcript'); + const answer = resolveRecordArtifactPointer(record, 'answer'); + + if (!transcript.path) { + return c.json({ + status: transcript.unsupportedReason ? 'unsupported' : 'missing', + message: transcript.unsupportedReason ?? 
missingTranscriptMessage(), + ...(transcript.description && { pointer: transcript.description }), + }); + } + + const resolvedTranscript = resolveReadableRunArtifactFile(baseDir, transcript.path); + if (resolvedTranscript.error) { + return c.json({ + status: 'dangling', + transcript_path: transcript.path, + message: resolvedTranscript.error ?? 'Transcript artifact path could not be resolved.', + ...(transcript.description && { pointer: transcript.description }), + }); + } + + if (!resolvedTranscript.absolutePath) { + const refMessage = transcript.ref ? ` on ${transcript.ref}` : ''; + return c.json({ + status: 'dangling', + transcript_path: transcript.path, + message: `Transcript artifact pointer${refMessage} is present, but ${transcript.path} is not available in this run workspace.`, + ...(transcript.description && { pointer: transcript.description }), + }); + } + + const content = readFileSync(resolvedTranscript.absolutePath, 'utf8'); + const answerContent = readOptionalRunArtifactText(baseDir, answer); + + return c.json({ + status: 'ok', + transcript_path: transcript.path, + content, + language: inferLanguage(transcript.path), + ...(answer.path && { answer_path: answer.path }), + ...(answerContent !== undefined && { answer_content: answerContent }), + ...(transcript.description && { pointer: transcript.description }), + }); + } catch { + return c.json({ error: 'Failed to load transcript artifact' }, 500); + } +} + async function handleExperiments(c: C, { searchDir, agentvDir, projectId }: DataContext) { const { runs: metas } = await listMergedResultFiles(searchDir, undefined, projectId); const { threshold: pass_threshold } = loadStudioConfig(agentvDir); @@ -1025,6 +1282,8 @@ async function handleCompare(c: C, { searchDir, agentvDir, projectId }: DataCont remote_tags?: string[]; pending_tags?: string[]; metadata_dirty?: boolean; + final_state: RunFinalState; + oplog_watermark: RunOplogWatermark; source: 'local' | 'remote'; eval_count: number; quality_count: 
number; @@ -1472,8 +1731,14 @@ async function handleRunTagsPut(c: C, { searchDir, projectId }: DataContext) { } const entry = writeRunTags(meta.path, tags as string[]); + const responseState = localTagMutationResponse({ + tags: entry?.tags ?? [], + updatedAt: entry?.updated_at, + watermark: entry?.oplog_watermark, + }); return c.json({ tags: entry?.tags ?? [], + ...responseState, updated_at: entry?.updated_at ?? new Date().toISOString(), }); } catch (err) { @@ -1494,8 +1759,18 @@ async function handleRunTagsDelete(c: C, { searchDir, projectId }: DataContext) }); } - deleteRunTags(meta.path); - return c.json({ ok: true }); + const entry = writeRunTags(meta.path, []); + const responseState = localTagMutationResponse({ + tags: entry.tags, + updatedAt: entry.updated_at, + watermark: entry.oplog_watermark, + }); + return c.json({ + ok: true, + tags: entry.tags, + ...responseState, + updated_at: entry.updated_at, + }); } catch (err) { return c.json({ error: (err as Error).message }, remoteMetadataErrorStatus(err)); } @@ -1831,6 +2106,8 @@ export function createApp( remote_tags?: string[]; pending_tags?: string[]; metadata_dirty?: boolean; + final_state: RunFinalState; + oplog_watermark: RunOplogWatermark; source: 'local' | 'remote'; project_id: string; project_name: string; @@ -1946,6 +2223,9 @@ export function createApp( handleCategorySuites(c, defaultCtx), ); app.get('/api/runs/:filename/evals/:evalId', (c) => handleEvalDetail(c, defaultCtx)); + app.get('/api/runs/:filename/evals/:evalId/transcript', (c) => + handleEvalTranscript(c, defaultCtx), + ); app.get('/api/runs/:filename/evals/:evalId/files', (c) => handleEvalFiles(c, defaultCtx)); app.get('/api/runs/:filename/evals/:evalId/files/*', (c) => handleEvalFileContent(c, defaultCtx)); app.get('/api/experiments', (c) => handleExperiments(c, defaultCtx)); @@ -1976,11 +2256,11 @@ export function createApp( let testCount = m.testCount; let executionErrorCount = 0; try { - const loaded = await 
loadManifestResultsForMeta(searchDir, m, defaultCtx.projectId); - totalCostUsd = loaded.reduce((sum, r) => sum + (r.costUsd ?? 0), 0); - if (loaded.length > 0) { + const records = await loadLightweightResultsForMeta(searchDir, m, defaultCtx.projectId); + totalCostUsd = records.reduce((sum, r) => sum + (r.costUsd ?? 0), 0); + if (records.length > 0) { const qualitySummary = summarizeQualityResults( - loaded, + records, loadStudioConfig(agentvDir).threshold, ); testCount = qualitySummary.totalCount; @@ -2064,6 +2344,9 @@ export function createApp( app.get('/api/projects/:projectId/runs/:filename/evals/:evalId', (c) => withProject(c, handleEvalDetail), ); + app.get('/api/projects/:projectId/runs/:filename/evals/:evalId/transcript', (c) => + withProject(c, handleEvalTranscript), + ); app.get('/api/projects/:projectId/runs/:filename/evals/:evalId/files', (c) => withProject(c, handleEvalFiles), ); @@ -2283,19 +2566,19 @@ export const resultsServeCommand = command({ // project's configured run workspace and fall back to the empty state. if (source) { sourceFile = await resolveSourceFile(source, cwd); - results = loadManifestResults(sourceFile); + results = loadManifestResults(sourceFile, { hydrateTranscriptTrace: false }); } else { // Auto-discover: run cache -> directory scan -> empty state const cache = await loadRunCache(cwd); const cachedFile = cache ? 
resolveRunCacheFile(cache) : ''; if (cachedFile && existsSync(cachedFile)) { sourceFile = cachedFile; - results = loadManifestResults(cachedFile); + results = loadManifestResults(cachedFile, { hydrateTranscriptTrace: false }); } else { const metas = listResultFiles(cwd, 1); if (metas.length > 0) { sourceFile = metas[0].path; - results = loadManifestResults(metas[0].path); + results = loadManifestResults(metas[0].path, { hydrateTranscriptTrace: false }); } // If no metas, results stays empty — dashboard shows welcome state } diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 14ab4dfdd..71c1e88e9 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -1,11 +1,19 @@ import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { createHash } from 'node:crypto'; import { mkdir, readFile, readdir, rm, writeFile } from 'node:fs/promises'; import path from 'node:path'; import { + AGENTV_RESULTS_ARTIFACTS_REF, + CANONICAL_TRACE_ARTIFACT_PATH, + CANONICAL_TRANSCRIPT_ARTIFACT_PATH, + EXECUTION_TRACE_SCHEMA_VERSION, type EvalTest, type EvaluationResult, type GraderResult, + TRACE_JSON_MEDIA_TYPE, + TRANSCRIPT_JSONL_MEDIA_TYPE, + TRANSCRIPT_SCHEMA_VERSION, TraceEnvelopeWireSchema, buildTraceFromMessages, fromTraceEnvelopeWire, @@ -28,6 +36,8 @@ import { writeArtifacts, writeArtifactsFromResults, } from '../../../src/commands/eval/artifact-writer.js'; +import { prepareResultForJsonl } from '../../../src/commands/eval/run-eval.js'; +import { toSnakeCaseDeep } from '../../../src/utils/case-conversion.js'; function makeResult(overrides: Partial = {}): EvaluationResult { const result = { @@ -72,6 +82,10 @@ function makeEvaluatorResult(overrides: Partial = {}): GraderResul } as GraderResult; } +function sha256Hex(content: Buffer): string { + return createHash('sha256').update(content).digest('hex'); +} + // 
--------------------------------------------------------------------------- // Grading artifact // --------------------------------------------------------------------------- @@ -587,6 +601,59 @@ describe('parseJsonlResults', () => { expect(results[0].trace.toolCalls).toEqual({ rg: 1 }); }); + it('rejects camelCase artifact pointer rows for the new wire field', () => { + const content = `${JSON.stringify({ + test_id: 'pointer-row', + target: 'codex', + score: 1, + artifactPointers: { + transcript: { + ref: 'agentv/artifacts/v1', + key: 'transcripts/pointer-row/outputs/transcript.jsonl', + object_version: 'sha256:test', + path: 'pointer-row/outputs/transcript.jsonl', + sha256: 'test', + size: 1, + schema_version: 'agentv.transcript.v1', + media_type: 'application/x-ndjson', + family: 'transcripts', + }, + }, + })}\n`; + + expect(() => parseJsonlResults(content)).toThrow(/Use "artifact_pointers"/); + }); + + it('does not treat parsed raw provider log pointers as fresh source artifacts', () => { + const content = `${JSON.stringify({ + test_id: 'raw-log-case', + target: 'codex', + score: 1, + output: 'done', + raw_provider_log_path: 'raw-log-case/outputs/raw/provider.log', + })}\n`; + + const results = parseJsonlResults(content); + + expect(results).toHaveLength(1); + expect(results[0].rawProviderLogPath).toBeUndefined(); + }); + + it('preserves raw provider log pointer metadata at the per-case JSONL boundary', () => { + const rawLogPath = path.join(import.meta.dir, '.test-provider-source.log'); + const result = makeResult({ + testId: 'raw-log-jsonl-case', + rawProviderLogPath: rawLogPath, + }); + + const prepared = prepareResultForJsonl(result, { outputMessages: 1 }); + const wire = toSnakeCaseDeep(prepared) as Record; + + expect(prepared.rawProviderLogPath).toBe(rawLogPath); + expect(wire.raw_provider_log_path).toBe(rawLogPath); + expect(wire).not.toHaveProperty('raw_provider_log'); + }); + it('handles empty content', () => { 
expect(parseJsonlResults('')).toHaveLength(0); }); @@ -828,9 +895,8 @@ describe('writeArtifactsFromResults', () => { await writeArtifactsFromResults(results, testDir); - const transcriptLines = ( - await readFile(path.join(testDir, 'transcript-case', 'outputs', 'transcript.jsonl'), 'utf8') - ) + const transcriptPath = path.join(testDir, 'transcript-case', 'outputs', 'transcript.jsonl'); + const transcriptLines = (await readFile(transcriptPath, 'utf8')) .trim() .split('\n') .map((line) => JSON.parse(line)); @@ -911,7 +977,8 @@ describe('writeArtifactsFromResults', () => { expect(transcriptLines[1]).not.toHaveProperty('providerSessionId'); expect(envelope.schema_version).toBe('agentv.trace.v1'); expect(envelope.artifact_id).toMatch(/^execution-trace-/); - expect(envelope.artifacts.trace_path).toBe('outputs/trace.json'); + expect(envelope.artifacts.trace_path).toBe(CANONICAL_TRACE_ARTIFACT_PATH); + expect(envelope.artifacts.transcript_path).toBe(CANONICAL_TRANSCRIPT_ARTIFACT_PATH); expect(envelope.artifacts).not.toHaveProperty('execution_trace_path'); expect(envelope.eval.test_id).toBe('transcript-case'); expect(envelope.trace.spans.map((span) => span.attributes['gen_ai.operation.name'])).toEqual([ @@ -919,12 +986,94 @@ describe('writeArtifactsFromResults', () => { 'chat', 'execute_tool', ]); + await expect( + readFile(path.join(testDir, 'transcript-case', 'outputs', 'transcript.json'), 'utf8'), + ).rejects.toThrow(); const indexLine = JSON.parse( (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), ); expect(indexLine.transcript_path).toBe('transcript-case/outputs/transcript.jsonl'); + expect(indexLine.transcript_path.endsWith(CANONICAL_TRANSCRIPT_ARTIFACT_PATH)).toBe(true); expect(indexLine).not.toHaveProperty('trace_path'); + + const traceContent = await readFile( + path.join(testDir, 'transcript-case', 'outputs', 'trace.json'), + ); + const transcriptContent = await readFile(transcriptPath); + const traceSha = sha256Hex(traceContent); + const 
transcriptSha = sha256Hex(transcriptContent); + + expect(indexLine.artifact_pointers.trace).toMatchObject({ + ref: AGENTV_RESULTS_ARTIFACTS_REF, + key: 'traces/transcript-case/outputs/trace.json', + object_version: `sha256:${traceSha}`, + path: 'transcript-case/outputs/trace.json', + sha256: traceSha, + size: traceContent.byteLength, + schema_version: EXECUTION_TRACE_SCHEMA_VERSION, + media_type: TRACE_JSON_MEDIA_TYPE, + family: 'traces', + }); + expect(indexLine.artifact_pointers.transcript).toMatchObject({ + ref: AGENTV_RESULTS_ARTIFACTS_REF, + key: 'transcripts/transcript-case/outputs/transcript.jsonl', + object_version: `sha256:${transcriptSha}`, + path: 'transcript-case/outputs/transcript.jsonl', + sha256: transcriptSha, + size: transcriptContent.byteLength, + schema_version: TRANSCRIPT_SCHEMA_VERSION, + media_type: TRANSCRIPT_JSONL_MEDIA_TYPE, + family: 'transcripts', + }); + }); + + it('copies optional raw provider logs as non-canonical evidence', async () => { + const rawLogPath = path.join(testDir, 'provider-source.log'); + const rawLog = [ + '# provider-native stream log', + '{"time":"00:00","data":{"camelCaseProviderKey":true,"toolInput":{"filePath":"src/index.ts"}}}', + '', + ].join('\n'); + await mkdir(testDir, { recursive: true }); + await writeFile(rawLogPath, rawLog, 'utf8'); + + const results = [ + makeResult({ + testId: 'raw-log-case', + target: 'codex', + output: 'Raw log copied', + rawProviderLogPath: rawLogPath, + }), + ]; + + await writeArtifactsFromResults(results, testDir); + + const copiedRawLogPath = path.join(testDir, 'raw-log-case', 'outputs', 'raw', 'provider.log'); + expect(await readFile(copiedRawLogPath, 'utf8')).toBe(rawLog); + + const transcriptPath = path.join(testDir, 'raw-log-case', 'outputs', 'transcript.jsonl'); + await expect(readFile(transcriptPath, 'utf8')).resolves.toContain( + '"schema_version":"agentv.transcript.v1"', + ); + await expect( + readFile(path.join(testDir, 'raw-log-case', 'outputs', 'transcript.json'), 
'utf8'), + ).rejects.toThrow(); + + const envelope = TraceEnvelopeWireSchema.parse( + JSON.parse( + await readFile(path.join(testDir, 'raw-log-case', 'outputs', 'trace.json'), 'utf8'), + ), + ); + expect(envelope.artifacts.raw_provider_log_path).toBe('outputs/raw/provider.log'); + expect(envelope.artifacts.transcript_path).toBe('outputs/transcript.jsonl'); + + const indexLine = JSON.parse( + (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), + ); + expect(indexLine.raw_provider_log_path).toBe('raw-log-case/outputs/raw/provider.log'); + expect(indexLine.transcript_path).toBe('raw-log-case/outputs/transcript.jsonl'); + expect(indexLine).not.toHaveProperty('transcript_json_path'); }); it('omits per-test transcript links when the execution trace has no transcript rows', async () => { @@ -945,6 +1094,15 @@ describe('writeArtifactsFromResults', () => { (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), ); expect(indexLine).not.toHaveProperty('transcript_path'); + expect(indexLine.artifact_pointers.trace).toMatchObject({ + ref: AGENTV_RESULTS_ARTIFACTS_REF, + key: 'traces/no-transcript-case/outputs/trace.json', + path: 'no-transcript-case/outputs/trace.json', + schema_version: EXECUTION_TRACE_SCHEMA_VERSION, + media_type: TRACE_JSON_MEDIA_TYPE, + family: 'traces', + }); + expect(indexLine.artifact_pointers).not.toHaveProperty('transcript'); const envelope = TraceEnvelopeWireSchema.parse( JSON.parse( diff --git a/apps/cli/test/commands/results/combine.test.ts b/apps/cli/test/commands/results/combine.test.ts index 9d5dfc5d3..889ef84d0 100644 --- a/apps/cli/test/commands/results/combine.test.ts +++ b/apps/cli/test/commands/results/combine.test.ts @@ -14,6 +14,13 @@ function toJsonl(...records: object[]): string { return `${records.map((record) => JSON.stringify(record)).join('\n')}\n`; } +function readIndex(filePath: string): Record[] { + return readFileSync(filePath, 'utf8') + .trim() + .split('\n') + .map((line) => JSON.parse(line) as 
Record); +} + const result = (overrides: Record = {}) => ({ timestamp: '2026-06-01T10:00:00.000Z', test_id: 'test-a', @@ -87,6 +94,108 @@ describe('results combine', () => { expect(benchmark.metadata.timestamp).toBe('2026-06-01T10:00:00.000Z'); }); + it('copies and rewrites artifact pointers when combining runs', () => { + const first = seedRun('run-a', [ + result({ + artifact_dir: 'demo/test-a', + transcript_path: 'demo/test-a/outputs/transcript.jsonl', + raw_provider_log_path: 'demo/test-a/outputs/raw/provider.log', + artifact_pointers: { + trace: { + ref: 'agentv/artifacts/v1', + key: 'traces/demo/test-a/outputs/trace.json', + object_version: 'sha256:trace', + path: 'demo/test-a/outputs/trace.json', + sha256: 'trace', + size: 18, + schema_version: 'agentv.trace.v1', + media_type: 'application/vnd.agentv.trace.v1+json', + family: 'traces', + }, + transcript: { + ref: 'agentv/artifacts/v1', + key: 'transcripts/demo/test-a/outputs/transcript.jsonl', + object_version: 'sha256:transcript', + path: 'demo/test-a/outputs/transcript.jsonl', + sha256: 'transcript', + size: 180, + schema_version: 'agentv.transcript.v1', + media_type: 'application/x-ndjson', + family: 'transcripts', + }, + }, + }), + ]); + mkdirSync(path.join(first, 'demo', 'test-a', 'outputs', 'raw'), { recursive: true }); + writeFileSync(path.join(first, 'demo', 'test-a', 'outputs', 'trace.json'), '{"trace":[]}\n'); + writeFileSync( + path.join(first, 'demo', 'test-a', 'outputs', 'transcript.jsonl'), + `${JSON.stringify({ + schema_version: 'agentv.transcript.v1', + test_id: 'test-a', + target: 'mock', + message_index: 0, + role: 'assistant', + content: 'Pointer-backed transcript', + source: { provider: 'mock', session_id: 'session-a' }, + })}\n`, + ); + writeFileSync( + path.join(first, 'demo', 'test-a', 'outputs', 'raw', 'provider.log'), + '{"event":"provider-native"}\n', + ); + const second = seedRun('run-b', [ + result({ + timestamp: '2026-06-01T11:00:00.000Z', + test_id: 'test-b', + grading_path: 
'demo/test-b/grading.json', + timing_path: 'demo/test-b/timing.json', + }), + ]); + mkdirSync(path.join(second, 'demo', 'test-b'), { recursive: true }); + writeFileSync(path.join(second, 'demo', 'test-b', 'grading.json'), '{"assertions":[]}\n'); + writeFileSync( + path.join(second, 'demo', 'test-b', 'timing.json'), + '{"duration_ms":0,"total_duration_seconds":0,"total_tokens":0,"token_usage":{}}\n', + ); + + const combined = combineRunSources({ + cwd: tempDir, + sources: buildCombineRunSources([first, second], tempDir), + duplicatePolicy: 'error', + }); + + const [record] = readIndex(combined.manifestPath); + expect(record.artifact_dir).toBe('sources/source-1/demo/test-a'); + expect(record.transcript_path).toBe('sources/source-1/demo/test-a/outputs/transcript.jsonl'); + expect(record.raw_provider_log_path).toBe( + 'sources/source-1/demo/test-a/outputs/raw/provider.log', + ); + expect(record.artifact_pointers).toMatchObject({ + trace: { + key: 'traces/sources/source-1/demo/test-a/outputs/trace.json', + path: 'sources/source-1/demo/test-a/outputs/trace.json', + }, + transcript: { + key: 'transcripts/sources/source-1/demo/test-a/outputs/transcript.jsonl', + path: 'sources/source-1/demo/test-a/outputs/transcript.jsonl', + }, + }); + expect( + existsSync(path.join(combined.runDir, 'sources/source-1/demo/test-a/outputs/trace.json')), + ).toBe(true); + expect( + existsSync( + path.join(combined.runDir, 'sources/source-1/demo/test-a/outputs/transcript.jsonl'), + ), + ).toBe(true); + expect( + existsSync( + path.join(combined.runDir, 'sources/source-1/demo/test-a/outputs/raw/provider.log'), + ), + ).toBe(true); + }); + it('errors on duplicate rows unless latest is explicit', () => { const first = seedRun('run-a', [result({ timestamp: '2026-06-01T10:00:00.000Z', score: 0.1 })]); const second = seedRun('run-b', [ diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index c034089f7..434c45136 100644 --- 
a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -9,12 +9,18 @@ import type { IndexArtifactEntry, TimingArtifact, } from '../../../src/commands/eval/artifact-writer.js'; +import { parseJsonlResults } from '../../../src/commands/eval/artifact-writer.js'; import { + buildProjectionBundleFromExportedIndex, deriveExportRunId, deriveOutputDir, exportResults, loadExportSource, } from '../../../src/commands/results/export.js'; +import { + buildProjectionBundle, + serializeProjectionBundle, +} from '../../../src/commands/results/projection-bundle.js'; // ── Sample JSONL content (snake_case, matching on-disk format) ────────── @@ -96,6 +102,63 @@ const RESULT_NO_TRACE = { duration_ms: 500, }; +const RESULT_WITH_RAW_PAYLOADS = { + timestamp: '2026-03-18T10:00:20.000Z', + test_id: 'test-private', + suite: 'privacy', + score: 0.25, + assertions: [ + { + text: 'Avoids private content', + passed: false, + evidence: 'SECRET_ASSERTION_EVIDENCE', + }, + ], + output: 'SECRET_FINAL_OUTPUT', + target: 'codex', + input: [{ role: 'user', content: 'SECRET_PROMPT_TEXT' }], + scores: [ + { + name: 'privacy_review', + type: 'llm-grader', + score: 0.25, + assertions: [ + { + text: 'Avoids private content', + passed: false, + evidence: 'SECRET_SCORE_EVIDENCE', + }, + ], + details: { excerpt: 'SECRET_SCORE_DETAILS' }, + }, + ], + execution_status: 'quality_failure', + duration_ms: 900, + trace: { + messages: [ + { role: 'user', content: 'SECRET_PROMPT_TEXT' }, + { + role: 'assistant', + content: 'SECRET_FINAL_OUTPUT', + tool_calls: [ + { + id: 'tool-call-1', + tool: 'shell', + input: { command: 'cat SECRET_TOOL_ARGUMENTS' }, + output: 'SECRET_TOOL_RESULT', + status: 'ok', + }, + ], + }, + ], + events: [], + event_count: 2, + tool_calls: { shell: 1 }, + error_count: 0, + llm_call_count: 1, + }, +}; + function toJsonl(...records: object[]): string { return `${records.map((r) => JSON.stringify(r)).join('\n')}\n`; } @@ -184,6 +247,127 @@ 
describe('results export', () => { expect(deriveExportRunId(path.join(tempDir, 'legacy-results.jsonl'))).toBe('legacy-results'); }); + it('builds deterministic metadata-only projection bundle output for dry-run use', () => { + const sourceFile = path.join(tempDir, 'runs', 'privacy-run', 'index.jsonl'); + const [result] = parseJsonlResults(toJsonl(RESULT_WITH_RAW_PAYLOADS)); + + const first = buildProjectionBundle([result], { + sourceFile, + runId: 'privacy-run', + cwd: tempDir, + duplicatePolicy: 'update', + }); + const second = buildProjectionBundle([result], { + sourceFile, + runId: 'privacy-run', + cwd: tempDir, + duplicatePolicy: 'update', + }); + const serialized = serializeProjectionBundle(first); + + expect(serialized).toBe(serializeProjectionBundle(second)); + expect(first.content_policy).toMatchObject({ + raw_content: 'excluded', + raw_content_opt_in: false, + default_capture: 'metadata', + }); + expect(first.entries[0].artifact_refs).toMatchObject({ + status: 'planned_export', + timing_path: 'privacy/test-private/timing.json', + }); + expect(first.entries[0].artifact_refs).not.toHaveProperty('input_path'); + expect(first.entries[0].artifact_refs).not.toHaveProperty('output_path'); + expect(first.entries[0].artifact_refs).not.toHaveProperty('answer_path'); + expect(first.entries[0].artifact_refs).not.toHaveProperty('response_path'); + expect(first.entries[0].artifact_refs).not.toHaveProperty('transcript_path'); + expect(first.entries[0].artifact_refs).not.toHaveProperty('trace_path'); + expect(first.entries[0].feedback).not.toHaveProperty('grading_path'); + expect(first.entries[0].trace).not.toHaveProperty('envelope_ref'); + expect(first.entries[0].trace_envelope).not.toHaveProperty('artifacts'); + expect(first.entries[0].projection_identity.dimensions.run_id).toBe('privacy-run'); + expect(first.entries[0].trace_envelope.trace.spans.length).toBeGreaterThan(0); + expect(first.entries[0].feedback.scores?.[0]).not.toHaveProperty('evidence'); + 
expect(serialized).not.toContain('SECRET_PROMPT_TEXT'); + expect(serialized).not.toContain('SECRET_FINAL_OUTPUT'); + expect(serialized).not.toContain('SECRET_TOOL_ARGUMENTS'); + expect(serialized).not.toContain('SECRET_TOOL_RESULT'); + expect(serialized).not.toContain('SECRET_SCORE_EVIDENCE'); + }); + + it('builds projection bundles when grader scores omit assertion arrays', () => { + const sourceFile = path.join(tempDir, 'runs', 'legacy-grader-run', 'index.jsonl'); + const [result] = parseJsonlResults( + toJsonl({ + ...RESULT_FULL, + scores: [ + { + name: 'legacy_grader', + type: 'llm-grader', + score: 1, + }, + ], + }), + ); + + const bundle = buildProjectionBundle([result], { + sourceFile, + runId: 'legacy-grader-run', + cwd: tempDir, + }); + + expect(bundle.entries[0].feedback.scores?.[0]).toMatchObject({ + name: 'legacy_grader', + type: 'llm-grader', + score: 1, + }); + expect(bundle.entries[0].trace_envelope.scores?.[0]).toMatchObject({ + name: 'legacy_grader', + type: 'llm-grader', + score: 1, + }); + }); + + it('includes raw prompt, output, tool payloads, and score evidence only with opt-in', () => { + const sourceFile = path.join(tempDir, 'runs', 'privacy-run', 'index.jsonl'); + const [result] = parseJsonlResults(toJsonl(RESULT_WITH_RAW_PAYLOADS)); + + const bundle = buildProjectionBundle([result], { + sourceFile, + runId: 'privacy-run', + cwd: tempDir, + includeRawContent: true, + }); + const serialized = serializeProjectionBundle(bundle); + + expect(bundle.content_policy).toMatchObject({ + raw_content: 'included', + raw_content_opt_in: true, + default_capture: 'full', + }); + expect(bundle.entries[0].capture).toMatchObject({ + content: 'full', + redaction_level: 'none', + }); + expect(bundle.entries[0].artifact_refs).toMatchObject({ + status: 'planned_export', + input_path: 'privacy/test-private/input.md', + output_path: 'privacy/test-private/outputs/answer.md', + answer_path: 'privacy/test-private/outputs/answer.md', + response_path: 
'privacy/test-private/outputs/response.md', + trace_path: 'privacy/test-private/outputs/trace.json', + }); + expect(bundle.entries[0].trace.envelope_ref).toBe('privacy/test-private/outputs/trace.json'); + expect(bundle.entries[0].trace_envelope.artifacts).toBeDefined(); + expect(bundle.entries[0].feedback.grading_path).toBe('privacy/test-private/grading.json'); + expect(bundle.entries[0].raw_content).toBeDefined(); + expect(bundle.entries[0].feedback.scores?.[0]).toHaveProperty('evidence'); + expect(serialized).toContain('SECRET_PROMPT_TEXT'); + expect(serialized).toContain('SECRET_FINAL_OUTPUT'); + expect(serialized).toContain('SECRET_TOOL_ARGUMENTS'); + expect(serialized).toContain('SECRET_TOOL_RESULT'); + expect(serialized).toContain('SECRET_SCORE_EVIDENCE'); + }); + it('should create benchmark.json matching artifact-writer schema', async () => { const outputDir = path.join(tempDir, 'output'); const content = toJsonl(RESULT_FULL, RESULT_PARTIAL); @@ -319,6 +503,34 @@ describe('results export', () => { expect(readAnswer(outputDir, RESULT_FULL)).toBe('Hello, Alice!'); }); + it('builds projection bundles from emitted skipped artifacts for duplicate policy skip', async () => { + const sourceFile = path.join(tempDir, 'runs', 'retry-run', 'index.jsonl'); + const outputDir = path.join(tempDir, 'output'); + const updated = { ...RESULT_FULL, output: 'Skipped answer.' 
}; + + await exportResults(sourceFile, toJsonl(RESULT_FULL), outputDir, { + duplicatePolicy: 'update', + }); + await exportResults(sourceFile, toJsonl(updated), outputDir, { + duplicatePolicy: 'skip', + }); + + const bundle = buildProjectionBundleFromExportedIndex({ + sourceFile, + outputDir, + cwd: tempDir, + includeRawContent: true, + duplicatePolicy: 'skip', + }); + + expect(bundle.entries[0].artifact_refs.status).toBe('emitted'); + expect(bundle.entries[0].raw_content?.output).toBe('Hello, Alice!'); + expect(serializeProjectionBundle(bundle)).not.toContain('Skipped answer.'); + expect(bundle.entries[0].trace_envelope.projection_identity).toEqual( + readIndex(outputDir)[0].projection_identity, + ); + }); + it('fails duplicate projection artifacts when duplicate policy is error', async () => { const sourceFile = path.join(tempDir, 'runs', 'retry-run', 'index.jsonl'); const outputDir = path.join(tempDir, 'output'); diff --git a/apps/cli/test/commands/results/remote-metadata.test.ts b/apps/cli/test/commands/results/remote-metadata.test.ts index fb66e430b..44fe86dbc 100644 --- a/apps/cli/test/commands/results/remote-metadata.test.ts +++ b/apps/cli/test/commands/results/remote-metadata.test.ts @@ -11,6 +11,7 @@ import { readRemoteRunTags, writeRemoteRunTags, } from '../../../src/commands/results/remote-metadata.js'; +import { RUN_OPLOG_REF } from '../../../src/commands/results/run-oplog.js'; const RUN_TIMESTAMP = '2026-06-06T10-00-00-000Z'; @@ -33,7 +34,10 @@ function git(cmd: string, cwd: string): string { }).trim(); } -function seedRepo(repoDir: string): string { +function seedRepo( + repoDir: string, + options?: { readonly artifactTags?: readonly string[] }, +): string { git('git init --quiet', repoDir); git('git config user.email "test@example.com"', repoDir); git('git config user.name "Test User"', repoDir); @@ -41,10 +45,17 @@ function seedRepo(repoDir: string): string { const runDir = path.join(repoDir, 'runs', 'default', RUN_TIMESTAMP); mkdirSync(runDir, { 
recursive: true }); writeFileSync(path.join(runDir, 'index.jsonl'), '{"test_id":"alpha","score":1}\n'); - writeFileSync( - path.join(runDir, 'tags.json'), - `${JSON.stringify({ tags: ['remote-baseline'], updated_at: '2026-06-06T09:00:00.000Z' }, null, 2)}\n`, - ); + const artifactTags = options?.artifactTags ?? ['remote-baseline']; + if (artifactTags.length > 0) { + writeFileSync( + path.join(runDir, 'tags.json'), + `${JSON.stringify( + { tags: artifactTags, updated_at: '2026-06-06T09:00:00.000Z' }, + null, + 2, + )}\n`, + ); + } git('git add runs', repoDir); git('git commit --quiet -m "seed remote run"', repoDir); return path.join(runDir, 'index.jsonl'); @@ -72,6 +83,8 @@ describe('remote metadata tags', () => { expect(state.remoteTags).toEqual(['remote-baseline']); expect(state.pendingTags).toEqual(['pending', 'remote-baseline']); expect(state.dirty).toBe(true); + expect(state.oplogWatermark.ref).toBe(RUN_OPLOG_REF); + expect(state.oplogWatermark.operation_id).toBeString(); expect(state.metadataPath).toContain( path.join('metadata', 'runs', 'default', RUN_TIMESTAMP, 'tags.json'), ); @@ -83,6 +96,7 @@ describe('remote metadata tags', () => { expect(reloaded.tags).toEqual(['pending', 'remote-baseline']); expect(reloaded.pendingTags).toEqual(['pending', 'remote-baseline']); expect(reloaded.dirty).toBe(true); + expect(reloaded.oplogWatermark.operation_id).toBe(state.oplogWatermark.operation_id); }); it('uses committed metadata overlays as the clean remote baseline', () => { @@ -98,6 +112,7 @@ describe('remote metadata tags', () => { expect(reloaded.remoteTags).toEqual(['accepted']); expect(reloaded.pendingTags).toBeUndefined(); expect(reloaded.dirty).toBe(false); + expect(reloaded.oplogWatermark.ref).toBe(RUN_OPLOG_REF); }); it('persists clearing remote tags as an empty pending overlay', () => { @@ -112,6 +127,25 @@ describe('remote metadata tags', () => { expect(readFileSync(state.metadataPath, 'utf8')).toContain('"tags": []'); }); + it('records an explicit clear 
watermark when the remote baseline is already empty', () => { + const manifestPath = seedRepo(repoDir, { artifactTags: [] }); + + const state = writeRemoteRunTags(repoDir, manifestPath, []); + const metadata = JSON.parse(readFileSync(state.metadataPath, 'utf8')) as { + tags: string[]; + oplog_watermark: { ref: string; operation_id?: string; updated_at?: string }; + }; + + expect(state.tags).toEqual([]); + expect(state.remoteTags).toEqual([]); + expect(state.pendingTags).toEqual([]); + expect(state.dirty).toBe(true); + expect(state.oplogWatermark.ref).toBe(RUN_OPLOG_REF); + expect(state.oplogWatermark.operation_id).toBeString(); + expect(metadata.tags).toEqual([]); + expect(metadata.oplog_watermark.operation_id).toBe(state.oplogWatermark.operation_id); + }); + it('rejects writes when the configured results path is not a git checkout', () => { const runDir = path.join(repoDir, 'runs', 'default', RUN_TIMESTAMP); mkdirSync(runDir, { recursive: true }); diff --git a/apps/cli/test/commands/results/run-oplog.test.ts b/apps/cli/test/commands/results/run-oplog.test.ts new file mode 100644 index 000000000..ff6990bb0 --- /dev/null +++ b/apps/cli/test/commands/results/run-oplog.test.ts @@ -0,0 +1,108 @@ +import { describe, expect, it } from 'bun:test'; +import { execFileSync } from 'node:child_process'; + +import { + RUN_OPERATION_SCHEMA_VERSION, + RUN_OPLOG_REF, + buildRunIdFromRelativePath, + createRunTagsSetOperation, + materializeRunState, + watermarkFromRunOperation, +} from '../../../src/commands/results/run-oplog.js'; + +const PRIMARY_RESULTS_REF = 'agentv/results/v1'; +const ARTIFACTS_REF = 'agentv/artifacts/v1'; + +function refsHavePrefixConflict(left: string, right: string): boolean { + return left === right || left.startsWith(`${right}/`) || right.startsWith(`${left}/`); +} + +function isValidGitBranchRef(ref: string): boolean { + try { + execFileSync('git', ['check-ref-format', `refs/heads/${ref}`], { stdio: 'ignore' }); + return true; + } catch { + return false; + 
} +} + +describe('run operation log contract', () => { + it('defines the stable oplog ref', () => { + expect(RUN_OPLOG_REF).toBe('agentv/oplog/v1'); + }); + + it('keeps results, artifacts, and oplog refs non-prefix-conflicting', () => { + const refs = [PRIMARY_RESULTS_REF, ARTIFACTS_REF, RUN_OPLOG_REF]; + + for (const left of refs) { + expect(isValidGitBranchRef(left)).toBe(true); + } + + for (const [index, left] of refs.entries()) { + for (const right of refs.slice(index + 1)) { + expect(refsHavePrefixConflict(left, right)).toBe(false); + } + } + }); + + it('builds a typed tag replacement operation envelope', () => { + const operation = createRunTagsSetOperation({ + runId: 'smoke::2026-06-21T10-00-00-000Z', + runPath: 'smoke/2026-06-21T10-00-00-000Z', + tags: ['baseline', 'reviewed'], + actor: { kind: 'dashboard', id: 'local' }, + authoredAt: '2026-06-21T10:15:00.000Z', + operationId: 'op-123', + }); + + expect(operation).toEqual({ + schema_version: RUN_OPERATION_SCHEMA_VERSION, + operation_id: 'op-123', + operation_type: 'run.tags.set', + authored_at: '2026-06-21T10:15:00.000Z', + actor: { kind: 'dashboard', id: 'local' }, + subject: { + run_id: 'smoke::2026-06-21T10-00-00-000Z', + run_path: 'smoke/2026-06-21T10-00-00-000Z', + }, + payload: { + tags: ['baseline', 'reviewed'], + }, + }); + }); + + it('materializes final run state from tags and an operation watermark', () => { + const operation = createRunTagsSetOperation({ + runId: '2026-06-21T10-00-00-000Z', + tags: ['accepted'], + authoredAt: '2026-06-21T10:15:00.000Z', + operationId: 'op-456', + }); + + expect( + materializeRunState({ + tags: operation.payload.tags, + watermark: watermarkFromRunOperation(operation), + }), + ).toEqual({ + final_state: { + lifecycle: 'active', + tags: ['accepted'], + }, + oplog_watermark: { + ref: RUN_OPLOG_REF, + operation_id: 'op-456', + updated_at: '2026-06-21T10:15:00.000Z', + }, + }); + }); + + it('derives run IDs from results branch paths', () => { + 
expect(buildRunIdFromRelativePath('default/2026-06-21T10-00-00-000Z')).toBe( + '2026-06-21T10-00-00-000Z', + ); + expect(buildRunIdFromRelativePath('smoke/2026-06-21T10-00-00-000Z')).toBe( + 'smoke::2026-06-21T10-00-00-000Z', + ); + }); +}); diff --git a/apps/cli/test/commands/results/run-tags.test.ts b/apps/cli/test/commands/results/run-tags.test.ts new file mode 100644 index 000000000..23dabb79c --- /dev/null +++ b/apps/cli/test/commands/results/run-tags.test.ts @@ -0,0 +1,52 @@ +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; + +import { RUN_OPLOG_REF } from '../../../src/commands/results/run-oplog.js'; +import { + deleteRunTags, + readRunTags, + runTagsPath, + writeRunTags, +} from '../../../src/commands/results/run-tags.js'; + +describe('run tags sidecar', () => { + let tempDir: string; + let manifestPath: string; + + beforeEach(() => { + tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-run-tags-')); + const runDir = path.join(tempDir, '.agentv', 'results', 'runs', 'default', '2026-clear-tags'); + mkdirSync(runDir, { recursive: true }); + manifestPath = path.join(runDir, 'index.jsonl'); + writeFileSync(manifestPath, '{"test_id":"alpha","score":1}\n', 'utf8'); + }); + + afterEach(() => { + rmSync(tempDir, { recursive: true, force: true }); + }); + + it('records empty tags as a clear tombstone with an oplog watermark', () => { + writeRunTags(manifestPath, ['baseline']); + + const cleared = writeRunTags(manifestPath, []); + const reloaded = readRunTags(manifestPath); + + expect(existsSync(runTagsPath(manifestPath))).toBe(true); + expect(cleared.tags).toEqual([]); + expect(cleared.oplog_watermark?.ref).toBe(RUN_OPLOG_REF); + expect(cleared.oplog_watermark?.operation_id).toBeString(); + expect(reloaded).toEqual(cleared); + expect(readFileSync(runTagsPath(manifestPath), 
'utf8')).toContain('"tags": []'); + }); + + it('keeps physical sidecar deletion explicit', () => { + writeRunTags(manifestPath, []); + + deleteRunTags(manifestPath); + + expect(existsSync(runTagsPath(manifestPath))).toBe(false); + expect(readRunTags(manifestPath)).toBeUndefined(); + }); +}); diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 834d041ca..389089eaf 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -1,6 +1,14 @@ import { afterEach, beforeEach, describe, expect, it, spyOn } from 'bun:test'; import { execFileSync, execSync } from 'node:child_process'; -import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { + existsSync, + mkdirSync, + mkdtempSync, + readFileSync, + rmSync, + symlinkSync, + writeFileSync, +} from 'node:fs'; import os from 'node:os'; import { tmpdir } from 'node:os'; import path from 'node:path'; @@ -8,6 +16,7 @@ import { fileURLToPath } from 'node:url'; import { addProject, saveProjectRegistry } from '@agentv/core'; +import { RUN_OPLOG_REF } from '../../../src/commands/results/run-oplog.js'; import { createApp, loadResults, @@ -934,7 +943,13 @@ describe('serve app', () => { expect(res.status).toBe(200); const data = (await res.json()) as { - runs: Array<{ filename: string; source: string; on_remote: boolean }>; + runs: Array<{ + filename: string; + source: string; + on_remote: boolean; + final_state: { lifecycle: string; tags: string[] }; + oplog_watermark: { ref: string }; + }>; }; expect(data.runs).toHaveLength(1); // A local-only run (no remote configured) is not on the remote branch. 
@@ -942,6 +957,159 @@ describe('serve app', () => { filename, source: 'local', on_remote: false, + final_state: { + lifecycle: 'active', + tags: [], + }, + oplog_watermark: { + ref: RUN_OPLOG_REF, + }, + }); + }); + + it('exposes materialized final state and oplog watermark for local run tags', async () => { + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); + mkdirSync(runsDir, { recursive: true }); + const filename = '2026-03-25T10-00-00-000Z'; + const runDir = path.join(runsDir, filename); + mkdirSync(runDir, { recursive: true }); + writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(RESULT_A)); + writeFileSync( + path.join(runDir, 'tags.json'), + `${JSON.stringify( + { + tags: ['accepted'], + updated_at: '2026-06-21T10:15:00.000Z', + oplog_watermark: { + ref: RUN_OPLOG_REF, + operation_id: 'op-local-tags', + updated_at: '2026-06-21T10:15:00.000Z', + }, + }, + null, + 2, + )}\n`, + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + + const listRes = await app.request('/api/runs'); + expect(listRes.status).toBe(200); + const listData = (await listRes.json()) as { + runs: Array<{ + tags: string[]; + final_state: { lifecycle: string; tags: string[] }; + oplog_watermark: { ref: string; operation_id?: string; updated_at?: string }; + }>; + }; + expect(listData.runs[0]).toMatchObject({ + tags: ['accepted'], + final_state: { + lifecycle: 'active', + tags: ['accepted'], + }, + oplog_watermark: { + ref: RUN_OPLOG_REF, + operation_id: 'op-local-tags', + updated_at: '2026-06-21T10:15:00.000Z', + }, + }); + + const detailRes = await app.request(`/api/runs/${encodeURIComponent(filename)}`); + expect(detailRes.status).toBe(200); + const detailData = (await detailRes.json()) as { + tags: string[]; + final_state: { lifecycle: string; tags: string[] }; + oplog_watermark: { ref: string; operation_id?: string; updated_at?: string }; + }; + expect(detailData).toMatchObject({ + tags: ['accepted'], + final_state: { + lifecycle: 
'active', + tags: ['accepted'], + }, + oplog_watermark: { + ref: RUN_OPLOG_REF, + operation_id: 'op-local-tags', + updated_at: '2026-06-21T10:15:00.000Z', + }, + }); + }); + + it('preserves a local tag clear watermark after DELETE /tags', async () => { + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); + mkdirSync(runsDir, { recursive: true }); + const filename = '2026-03-25T10-30-00-000Z'; + const runDir = path.join(runsDir, filename); + mkdirSync(runDir, { recursive: true }); + writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(RESULT_A)); + writeFileSync( + path.join(runDir, 'tags.json'), + `${JSON.stringify( + { + tags: ['accepted'], + updated_at: '2026-06-21T10:15:00.000Z', + oplog_watermark: { + ref: RUN_OPLOG_REF, + operation_id: 'op-before-clear', + updated_at: '2026-06-21T10:15:00.000Z', + }, + }, + null, + 2, + )}\n`, + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + + const deleteRes = await app.request(`/api/runs/${encodeURIComponent(filename)}/tags`, { + method: 'DELETE', + }); + expect(deleteRes.status).toBe(200); + const deleteData = (await deleteRes.json()) as { + ok: boolean; + tags: string[]; + final_state: { lifecycle: string; tags: string[] }; + oplog_watermark: { ref: string; operation_id?: string; updated_at?: string }; + updated_at: string; + }; + expect(deleteData.ok).toBe(true); + expect(deleteData.tags).toEqual([]); + expect(deleteData.final_state).toEqual({ + lifecycle: 'active', + tags: [], + }); + expect(deleteData.oplog_watermark.ref).toBe(RUN_OPLOG_REF); + expect(deleteData.oplog_watermark.operation_id).toBeString(); + expect(deleteData.oplog_watermark.operation_id).not.toBe('op-before-clear'); + expect(deleteData.oplog_watermark.updated_at).toBe(deleteData.updated_at); + + const tagFile = JSON.parse(readFileSync(path.join(runDir, 'tags.json'), 'utf8')) as { + tags: string[]; + oplog_watermark: { ref: string; operation_id?: string; updated_at?: string }; + }; + 
expect(tagFile.tags).toEqual([]); + expect(tagFile.oplog_watermark.operation_id).toBe(deleteData.oplog_watermark.operation_id); + + const reloadedApp = createApp([], tempDir, tempDir, undefined, { studioDir }); + const detailRes = await reloadedApp.request(`/api/runs/${encodeURIComponent(filename)}`); + expect(detailRes.status).toBe(200); + const detailData = (await detailRes.json()) as { + tags: string[]; + final_state: { lifecycle: string; tags: string[] }; + oplog_watermark: { ref: string; operation_id?: string; updated_at?: string }; + }; + expect(detailData).toMatchObject({ + tags: [], + final_state: { + lifecycle: 'active', + tags: [], + }, + oplog_watermark: { + ref: RUN_OPLOG_REF, + operation_id: deleteData.oplog_watermark.operation_id, + updated_at: deleteData.oplog_watermark.updated_at, + }, }); }); @@ -2538,6 +2706,296 @@ describe('serve app', () => { }); }); + describe('GET /api/runs/:filename/evals/:evalId/transcript', () => { + it('loads canonical transcript JSONL lazily from the manifest pointer', async () => { + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'with-transcript'); + const runId = 'with-transcript::2026-03-25T10-00-00-000Z'; + const timestampDir = path.join(runsDir, '2026-03-25T10-00-00-000Z'); + const transcriptArtifactPath = 'demo/test-greeting/outputs/transcript.jsonl'; + const answerArtifactPath = 'demo/test-greeting/outputs/answer.md'; + const transcriptPath = path.join(timestampDir, transcriptArtifactPath); + const answerPath = path.join(timestampDir, answerArtifactPath); + const transcriptJsonl = `${JSON.stringify({ + test_id: 'test-greeting', + target: 'gpt-4o', + message_index: 0, + role: 'user', + content: 'Hello', + })}\n`; + + mkdirSync(path.dirname(transcriptPath), { recursive: true }); + writeFileSync(transcriptPath, transcriptJsonl); + writeFileSync(answerPath, 'Hello, Alice!'); + writeFileSync( + path.join(timestampDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + experiment: 'with-transcript', + 
transcript_path: transcriptArtifactPath, + answer_path: answerArtifactPath, + }), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request( + `/api/runs/${encodeURIComponent(runId)}/evals/test-greeting/transcript`, + ); + + expect(res.status).toBe(200); + const data = (await res.json()) as { + status: string; + transcript_path: string; + content: string; + answer_path: string; + answer_content: string; + }; + expect(data).toMatchObject({ + status: 'ok', + transcript_path: transcriptArtifactPath, + content: transcriptJsonl, + answer_path: answerArtifactPath, + answer_content: 'Hello, Alice!', + }); + }); + + it('loads pointer-shaped transcript metadata when it resolves to a local artifact path', async () => { + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'pointer-transcript'); + const runId = 'pointer-transcript::2026-03-25T11-00-00-000Z'; + const timestampDir = path.join(runsDir, '2026-03-25T11-00-00-000Z'); + const artifactPath = 'demo/test-greeting/outputs/transcript.jsonl'; + const transcriptPath = path.join(timestampDir, artifactPath); + const transcriptJsonl = `${JSON.stringify({ + test_id: 'test-greeting', + target: 'gpt-4o', + message_index: 0, + role: 'assistant', + content: 'Hello', + })}\n`; + + mkdirSync(path.dirname(transcriptPath), { recursive: true }); + writeFileSync(transcriptPath, transcriptJsonl); + writeFileSync( + path.join(timestampDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + experiment: 'pointer-transcript', + artifact_pointers: { + transcript: { + ref: 'agentv/artifacts/v1', + path: artifactPath, + }, + }, + }), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request( + `/api/runs/${encodeURIComponent(runId)}/evals/test-greeting/transcript`, + ); + + expect(res.status).toBe(200); + const data = (await res.json()) as { + status: string; + transcript_path: string; + content: string; + pointer: string; + }; + 
expect(data.status).toBe('ok'); + expect(data.transcript_path).toBe(artifactPath); + expect(data.content).toBe(transcriptJsonl); + expect(data.pointer).toContain('agentv/artifacts/v1'); + }); + + it('returns a clear missing state when no transcript pointer is recorded', async () => { + const runId = writeLocalRunArtifact( + tempDir, + 'missing-transcript', + '2026-03-25T12-00-00-000Z', + RESULT_A, + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request( + `/api/runs/${encodeURIComponent(runId)}/evals/test-greeting/transcript`, + ); + + expect(res.status).toBe(200); + const data = (await res.json()) as { status: string; message: string }; + expect(data.status).toBe('missing'); + expect(data.message).toContain('outputs/transcript.jsonl'); + }); + + it('returns a clear dangling state when the transcript pointer cannot be read', async () => { + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'dangling-transcript'); + const runId = 'dangling-transcript::2026-03-25T13-00-00-000Z'; + const timestampDir = path.join(runsDir, '2026-03-25T13-00-00-000Z'); + const artifactPath = 'demo/test-greeting/outputs/transcript.jsonl'; + + mkdirSync(timestampDir, { recursive: true }); + writeFileSync( + path.join(timestampDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + experiment: 'dangling-transcript', + transcript_path: artifactPath, + }), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request( + `/api/runs/${encodeURIComponent(runId)}/evals/test-greeting/transcript`, + ); + + expect(res.status).toBe(200); + const data = (await res.json()) as { + status: string; + transcript_path: string; + message: string; + }; + expect(data.status).toBe('dangling'); + expect(data.transcript_path).toBe(artifactPath); + expect(data.message).toContain('not available'); + }); + + it('treats symlinked transcript artifacts outside the run workspace as dangling', async () => { 
+ const secret = 'outside transcript secret'; + const outsidePath = path.join(tempDir, 'outside-transcript.jsonl'); + writeFileSync(outsidePath, secret); + + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'escaped-transcript'); + const runId = 'escaped-transcript::2026-03-25T13-30-00-000Z'; + const timestampDir = path.join(runsDir, '2026-03-25T13-30-00-000Z'); + const artifactPath = 'demo/test-greeting/outputs/transcript.jsonl'; + const symlinkPath = path.join(timestampDir, artifactPath); + + mkdirSync(path.dirname(symlinkPath), { recursive: true }); + symlinkSync(outsidePath, symlinkPath); + writeFileSync( + path.join(timestampDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + experiment: 'escaped-transcript', + transcript_path: artifactPath, + }), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request( + `/api/runs/${encodeURIComponent(runId)}/evals/test-greeting/transcript`, + ); + + expect(res.status).toBe(200); + const text = await res.text(); + expect(text).not.toContain(secret); + const data = JSON.parse(text) as { status: string; transcript_path: string }; + expect(data.status).toBe('dangling'); + expect(data.transcript_path).toBe(artifactPath); + }); + + it('omits symlinked answer artifacts outside the run workspace from transcript responses', async () => { + const secret = 'outside answer secret'; + const outsidePath = path.join(tempDir, 'outside-answer.md'); + writeFileSync(outsidePath, secret); + + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'escaped-answer'); + const runId = 'escaped-answer::2026-03-25T13-45-00-000Z'; + const timestampDir = path.join(runsDir, '2026-03-25T13-45-00-000Z'); + const transcriptArtifactPath = 'demo/test-greeting/outputs/transcript.jsonl'; + const answerArtifactPath = 'demo/test-greeting/outputs/answer.md'; + const transcriptPath = path.join(timestampDir, transcriptArtifactPath); + const answerPath = path.join(timestampDir, 
answerArtifactPath); + const transcriptJsonl = `${JSON.stringify({ + test_id: 'test-greeting', + target: 'gpt-4o', + message_index: 0, + role: 'user', + content: 'Hello', + })}\n`; + + mkdirSync(path.dirname(transcriptPath), { recursive: true }); + writeFileSync(transcriptPath, transcriptJsonl); + symlinkSync(outsidePath, answerPath); + writeFileSync( + path.join(timestampDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + experiment: 'escaped-answer', + transcript_path: transcriptArtifactPath, + answer_path: answerArtifactPath, + }), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request( + `/api/runs/${encodeURIComponent(runId)}/evals/test-greeting/transcript`, + ); + + expect(res.status).toBe(200); + const text = await res.text(); + expect(text).not.toContain(secret); + const data = JSON.parse(text) as { + status: string; + content: string; + answer_path: string; + answer_content?: string; + }; + expect(data.status).toBe('ok'); + expect(data.content).toBe(transcriptJsonl); + expect(data.answer_path).toBe(answerArtifactPath); + expect(data.answer_content).toBeUndefined(); + }); + + it('does not read transcript bodies for list, detail, or aggregate routes', async () => { + const timestamp = '2026-03-25T14-00-00-000Z'; + const transcriptArtifactPath = 'demo/test-greeting/outputs/transcript.jsonl'; + const runId = writeLocalRunArtifact(tempDir, 'lazy-guard', timestamp, { + ...RESULT_A, + transcript_path: transcriptArtifactPath, + }); + const timestampDir = path.join( + tempDir, + '.agentv', + 'results', + 'runs', + 'lazy-guard', + timestamp, + ); + mkdirSync(path.join(timestampDir, transcriptArtifactPath), { recursive: true }); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + + const listRes = await app.request('/api/runs'); + expect(listRes.status).toBe(200); + const listData = (await listRes.json()) as { + runs: Array<{ filename: string; target?: string }>; + }; + 
expect(listData.runs.find((run) => run.filename === runId)?.target).toBe('gpt-4o'); + + const detailRes = await app.request(`/api/runs/${encodeURIComponent(runId)}`); + expect(detailRes.status).toBe(200); + const detailData = (await detailRes.json()) as { results: unknown[] }; + expect(detailData.results).toHaveLength(1); + + const compareRes = await app.request('/api/compare'); + expect(compareRes.status).toBe(200); + const compareData = (await compareRes.json()) as { + cells: Array<{ experiment: string; eval_count: number }>; + }; + expect(compareData.cells.find((cell) => cell.experiment === 'lazy-guard')?.eval_count).toBe( + 1, + ); + + const indexRes = await app.request('/api/index'); + expect(indexRes.status).toBe(200); + const indexData = (await indexRes.json()) as { + entries: Array<{ run_filename: string; total_cost_usd: number }>; + }; + expect(indexData.entries.find((entry) => entry.run_filename === runId)?.total_cost_usd).toBe( + RESULT_A.cost_usd, + ); + }); + }); + describe('GET /api/runs/:filename/evals/:evalId/files/*', () => { it('loads file content for experiment-scoped run ids', async () => { const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'with-skills'); @@ -2612,6 +3070,37 @@ describe('serve app', () => { ); expect(await downloadRes.text()).toBe(transcriptJsonl); }); + + it('rejects symlinked artifact file reads outside the run workspace', async () => { + const secret = 'outside raw artifact secret'; + const outsidePath = path.join(tempDir, 'outside-response.md'); + writeFileSync(outsidePath, secret); + + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'escaped-file'); + const runId = 'escaped-file::2026-03-25T10-30-00-000Z'; + const timestampDir = path.join(runsDir, '2026-03-25T10-30-00-000Z'); + const artifactPath = 'demo/test-greeting/outputs/response.md'; + const symlinkPath = path.join(timestampDir, artifactPath); + + mkdirSync(path.dirname(symlinkPath), { recursive: true }); + symlinkSync(outsidePath, 
symlinkPath); + writeFileSync( + path.join(timestampDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + experiment: 'escaped-file', + output_path: artifactPath, + }), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request( + `/api/runs/${encodeURIComponent(runId)}/evals/test-greeting/files/${artifactPath}?raw=1`, + ); + + expect(res.status).toBe(403); + expect(await res.text()).not.toContain(secret); + }); }); // ── GET /api/compare (tag filter) ─────────────────────────────────── diff --git a/apps/cli/test/commands/results/shared.test.ts b/apps/cli/test/commands/results/shared.test.ts index 12f64f61e..5e701116e 100644 --- a/apps/cli/test/commands/results/shared.test.ts +++ b/apps/cli/test/commands/results/shared.test.ts @@ -79,6 +79,55 @@ describe('results shared source resolution', () => { expect(results[0].trace.toolCalls).toEqual({ rg: 1 }); }); + it('hydrates transcripts from artifact pointers when transcript_path is absent', () => { + const runDir = path.join(tempDir, '.agentv', 'results', 'runs', '2026-03-25T10-00-00-000Z'); + const transcriptRelativePath = 'pointer-case/outputs/transcript.jsonl'; + mkdirSync(path.join(runDir, 'pointer-case', 'outputs'), { recursive: true }); + writeFileSync( + path.join(runDir, transcriptRelativePath), + `${JSON.stringify({ + schema_version: 'agentv.transcript.v1', + test_id: 'pointer-case', + target: 'codex', + message_index: 0, + role: 'assistant', + content: 'Loaded from pointer', + source: { provider: 'codex', session_id: 'session-pointer' }, + })}\n`, + ); + const indexPath = path.join(runDir, 'index.jsonl'); + writeFileSync( + indexPath, + `${JSON.stringify({ + timestamp: '2026-03-25T10:00:00.000Z', + test_id: 'pointer-case', + target: 'codex', + score: 1, + grading_path: 'pointer-case/grading.json', + timing_path: 'pointer-case/timing.json', + artifact_pointers: { + transcript: { + ref: 'agentv/artifacts/v1', + key: 
'transcripts/pointer-case/outputs/transcript.jsonl', + object_version: 'sha256:test', + path: transcriptRelativePath, + sha256: 'test', + size: 1, + schema_version: 'agentv.transcript.v1', + media_type: 'application/x-ndjson', + family: 'transcripts', + }, + }, + })}\n`, + ); + + const results = loadManifestResults(indexPath); + + expect(results).toHaveLength(1); + expect(results[0].trace.messages[0]?.content).toBe('Loaded from pointer'); + expect(results[0].trace.messages[0]?.role).toBe('assistant'); + }); + it('rejects eval-case-only rows with migration guidance', () => { const runDir = path.join(tempDir, '.agentv', 'results', 'runs', '2026-03-25T10-00-00-000Z'); mkdirSync(runDir, { recursive: true }); diff --git a/apps/dashboard/src/components/AnalyticsTab.tsx b/apps/dashboard/src/components/AnalyticsTab.tsx index 4c45c48aa..0378e4d76 100644 --- a/apps/dashboard/src/components/AnalyticsTab.tsx +++ b/apps/dashboard/src/components/AnalyticsTab.tsx @@ -15,7 +15,7 @@ * Backend contract: * - `GET /api/compare` → { cells, runs? } * - `PUT /api/runs/:runId/tags` → replaces sidecar tags.json - * - `DELETE /api/runs/:runId/tags` → removes sidecar + * - `DELETE /api/runs/:runId/tags` → records an empty tag state * * To extend with a new mode: add a value to `ViewMode`, a button in the mode * toggle, and a new body component in the content switch. 
Hooks in any new diff --git a/apps/dashboard/src/components/EvalDetail.tsx b/apps/dashboard/src/components/EvalDetail.tsx index f1369b947..c9c8059a4 100644 --- a/apps/dashboard/src/components/EvalDetail.tsx +++ b/apps/dashboard/src/components/EvalDetail.tsx @@ -14,8 +14,10 @@ import { isPassing, projectEvalFileContentOptions, projectEvalFilesOptions, + projectEvalTranscriptOptions, useEvalFileContent, useEvalFiles, + useEvalTranscript, useStudioConfig, } from '~/lib/api'; import type { @@ -32,12 +34,7 @@ import type { FileNode } from './FileTree'; import { FileTree } from './FileTree'; import { MonacoViewer } from './MonacoViewer'; import { ScoreBar } from './ScoreBar'; -import { - TranscriptTimeline, - findAnswerPath, - findTranscriptPath, - parseTranscriptJsonl, -} from './TranscriptTimeline'; +import { TranscriptTimeline, parseTranscriptJsonl } from './TranscriptTimeline'; interface EvalDetailProps { eval: EvalResult; @@ -457,49 +454,68 @@ function TranscriptTab({ onOpenFile: (path: string) => void; }) { const evalId = result.testId; - const { data: filesData, isLoading: isLoadingFiles } = projectId - ? useQuery(projectEvalFilesOptions(projectId, runId, evalId)) - : useEvalFiles(runId, evalId); - const files = filesData?.files ?? []; - const transcriptPath = findTranscriptPath(files); - const answerPath = findAnswerPath(files); - - const { data: transcriptContentData, isLoading: isLoadingTranscript } = projectId - ? useQuery(projectEvalFileContentOptions(projectId, runId, evalId, transcriptPath ?? '')) - : useEvalFileContent(runId, evalId, transcriptPath ?? ''); - const { data: answerContentData } = projectId - ? useQuery(projectEvalFileContentOptions(projectId, runId, evalId, answerPath ?? '')) - : useEvalFileContent(runId, evalId, answerPath ?? ''); + const { + data: transcriptData, + isLoading: isLoadingTranscript, + error: transcriptError, + } = projectId + ? 
useQuery(projectEvalTranscriptOptions(projectId, runId, evalId)) + : useEvalTranscript(runId, evalId); + const transcriptPath = transcriptData?.transcript_path; + const answerPath = transcriptData?.answer_path; + const transcriptContent = transcriptData?.status === 'ok' ? (transcriptData.content ?? '') : ''; const parsedTranscript = useMemo( - () => parseTranscriptJsonl(transcriptContentData?.content ?? ''), - [transcriptContentData?.content], + () => parseTranscriptJsonl(transcriptContent), + [transcriptContent], ); - if (isLoadingFiles) { + if (isLoadingTranscript) { return (
- Loading transcript artifacts... + Loading transcript artifact...
); } - if (!transcriptPath) { + if (transcriptError) { + return ( +
+

Transcript could not be loaded

+

{transcriptError.message}

+
+ ); + } + + if (!transcriptData || transcriptData.status === 'missing') { return (

No structured transcript

- This run does not include canonical outputs/transcript.jsonl. Dashboard does - not parse response.md or markdown transcripts for this view. + {transcriptData?.message ?? + 'This run does not include canonical outputs/transcript.jsonl. Dashboard does not parse response.md or markdown transcripts for this view.'}

); } - if (isLoadingTranscript) { + if (transcriptData.status === 'dangling' || transcriptData.status === 'unsupported') { return ( -
- Loading {transcriptPath}... +
+

+ {transcriptData.status === 'dangling' + ? 'Transcript artifact unavailable' + : 'Transcript pointer unsupported'} +

+

+ {transcriptData.message ?? 'The transcript artifact could not be resolved.'} +

+ {transcriptPath ? ( +

{transcriptPath}

+ ) : null} + {transcriptData.pointer ? ( +

{transcriptData.pointer}

+ ) : null}
); } @@ -510,27 +526,31 @@ function TranscriptTab({

Transcript could not be parsed

{parsedTranscript.error}

- - - Open raw JSONL - + {transcriptPath ? ( + <> + + + Open raw JSONL + + + ) : null}
); @@ -550,25 +570,29 @@ function TranscriptTab({ const answerHref = answerPath ? artifactFileContentUrl({ projectId, runId, evalId, filePath: answerPath, raw: true }) : undefined; - const transcriptHref = artifactFileContentUrl({ - projectId, - runId, - evalId, - filePath: transcriptPath, - raw: true, - }); - const transcriptDownloadHref = artifactFileContentUrl({ - projectId, - runId, - evalId, - filePath: transcriptPath, - download: true, - }); + const transcriptHref = transcriptPath + ? artifactFileContentUrl({ + projectId, + runId, + evalId, + filePath: transcriptPath, + raw: true, + }) + : undefined; + const transcriptDownloadHref = transcriptPath + ? artifactFileContentUrl({ + projectId, + runId, + evalId, + filePath: transcriptPath, + download: true, + }) + : undefined; return ( (url: string): Promise { @@ -231,6 +232,17 @@ export function evalFileContentOptions(runId: string, evalId: string, filePath: }); } +export function evalTranscriptOptions(runId: string, evalId: string) { + return queryOptions({ + queryKey: ['runs', runId, 'evals', evalId, 'transcript'], + queryFn: () => + fetchJson( + `/api/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}/transcript`, + ), + enabled: !!runId && !!evalId, + }); +} + export function runCategoriesOptions(runId: string) { return queryOptions({ queryKey: ['runs', runId, 'categories'], @@ -321,6 +333,10 @@ export function useEvalFileContent(runId: string, evalId: string, filePath: stri return useQuery(evalFileContentOptions(runId, evalId, filePath)); } +export function useEvalTranscript(runId: string, evalId: string) { + return useQuery(evalTranscriptOptions(runId, evalId)); +} + export function useRunCategories(runId: string) { return useQuery(runCategoriesOptions(runId)); } @@ -553,6 +569,17 @@ export function projectEvalFileContentOptions( }); } +export function projectEvalTranscriptOptions(projectId: string, runId: string, evalId: string) { + return queryOptions({ + queryKey: ['projects', 
projectId, 'runs', runId, 'evals', evalId, 'transcript'], + queryFn: () => + fetchJson( + `${projectApiBase(projectId)}/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}/transcript`, + ), + enabled: !!projectId && !!runId && !!evalId, + }); +} + export function projectExperimentsOptions(projectId: string) { return queryOptions({ queryKey: ['projects', projectId, 'experiments'], @@ -665,7 +692,7 @@ export async function deleteRunApi(runId: string, projectId?: string): Promise; } -/** Remove the tags sidecar for a run. */ +/** Clear the tags for a run while preserving the clear watermark. */ export async function deleteRunTagsApi(runId: string, projectId?: string): Promise { const url = projectId ? `${projectApiBase(projectId)}/runs/${encodeURIComponent(runId)}/tags` diff --git a/apps/dashboard/src/lib/trace-read-model.test.ts b/apps/dashboard/src/lib/trace-read-model.test.ts new file mode 100644 index 000000000..68127da36 --- /dev/null +++ b/apps/dashboard/src/lib/trace-read-model.test.ts @@ -0,0 +1,295 @@ +import { describe, expect, it } from 'bun:test'; + +import { + traceSessionEnvelopeFixture, + traceSessionMissingOptionalFixture, +} from './__fixtures__/trace-session-read-model'; +import { + type TraceSpanNode, + buildTraceSpanTree, + traceEnvelopeToTraceSessionResponse, +} from './trace-read-model'; + +function expectSnakeCaseFixtureKeys(value: unknown, path: string[] = []): void { + if (Array.isArray(value)) { + value.forEach((entry, index) => expectSnakeCaseFixtureKeys(entry, [...path, String(index)])); + return; + } + if (!value || typeof value !== 'object') { + return; + } + + for (const [key, entry] of Object.entries(value)) { + const parentKey = path.at(-1); + if (parentKey !== 'attributes') { + expect(key, [...path, key].join('.')).toMatch(/^[a-z][a-z0-9_]*$/); + } + expectSnakeCaseFixtureKeys(entry, [...path, key]); + } +} + +function flattenTree(nodes: readonly TraceSpanNode[]): TraceSpanNode[] { + return nodes.flatMap((node) => 
[node, ...flattenTree(node.children)]); +} + +describe('trace session read model', () => { + it('projects snake_case trace artifacts into stable Dashboard span trees', () => { + const session = traceEnvelopeToTraceSessionResponse(traceSessionEnvelopeFixture, { + artifactPath: 'nested-session__codex/outputs/trace.json', + }); + const tree = buildTraceSpanTree(session.spans); + + expect(session).toMatchObject({ + schema_version: 'agentv.dashboard.trace_session.v1', + run_id: '2026-06-21T10-00-00-000Z', + test_id: 'nested-session', + target: 'codex', + trace_id: 'trace-123', + root_span_id: 'root-span', + source: { + artifact_path: 'nested-session__codex/outputs/trace.json', + }, + }); + expect(session.spans.map((span) => span.id)).toEqual([ + 'root-span', + 'child-chat', + 'grandchild-tool', + ]); + expect(session.spans.map((span) => span.parent_span_id)).toEqual([ + null, + 'root-span', + 'child-chat', + ]); + expect(tree).toHaveLength(1); + expect(tree[0].spanId).toBe('root-span'); + expect(tree[0].children[0].spanId).toBe('child-chat'); + expect(tree[0].children[0].children[0].spanId).toBe('grandchild-tool'); + }); + + it('preserves score events, annotation events, scores, and unknown attributes', () => { + const session = traceEnvelopeToTraceSessionResponse(traceSessionEnvelopeFixture); + const root = session.spans.find((span) => span.span_id === 'root-span'); + + expect(root?.duration_ms).toBe(1500); + expect(root?.token_usage).toEqual({ input: 14, output: 9 }); + expect(root?.attributes?.['custom.unknown_value']).toEqual({ nested_value: true }); + expect(root?.attributes?.['gen_ai.usage.input_tokens']).toBe(14); + expect(root?.attributes).not.toHaveProperty('external_trace_url'); + expect(root?.attributes).not.toHaveProperty('external_trace_token'); + expect(root?.attributes).not.toHaveProperty('access_token'); + + expect(session.events.map((event) => [event.event_id, event.kind, event.name])).toEqual([ + ['annotation-1', 'annotation', 'agentv.annotation'], + 
['score-1', 'score', 'agentv.score'], + ]); + expect(session.events[0]).toMatchObject({ + text: 'Reviewer note', + passed: true, + attributes: { extra_context: { source: 'grader' }, nested: { safe_value: 'visible' } }, + }); + expect(session.events[1]).toMatchObject({ + score: 0.82, + text: 'Rubric score', + passed: true, + }); + expect(session.scores).toEqual([ + { + name: 'rubric', + type: 'llm-grader', + score: 0.82, + weight: 1, + verdict: 'pass', + source: 'llm', + evaluated_at: '2026-06-21T10:00:02.300Z', + target_span_id: 'root-span', + evidence: { + assertions: [{ text: 'Rubric score', passed: true }], + }, + }, + ]); + }); + + it('keeps external_trace links safe and leaves AgentV as canonical source', () => { + const session = traceEnvelopeToTraceSessionResponse(traceSessionEnvelopeFixture); + + expect(session.external_trace).toEqual({ + provider: 'phoenix', + project: 'agentv-dogfood', + session_id: 'codex-session-123', + trace_id: 'phoenix-trace-456', + url: 'https://phoenix.example/projects/agentv-dogfood/traces/phoenix-trace-456', + }); + expect(JSON.stringify(session.external_trace)).not.toContain('secret'); + expect(JSON.stringify(session.external_trace)).not.toContain('api_key'); + expect(JSON.stringify(session)).not.toContain('secret'); + expect(JSON.stringify(session)).not.toContain('api_key'); + expect(session.source?.metadata).toEqual({ + safe_note: 'local artifact remains canonical', + }); + }); + + it('does not invent zero timing, token usage, or broken external links for missing fields', () => { + const session = traceEnvelopeToTraceSessionResponse(traceSessionMissingOptionalFixture); + const root = session.spans[0]; + + expect(root.start_time_unix_nano).toBeUndefined(); + expect(root.end_time_unix_nano).toBeUndefined(); + expect(root.start_time).toBeUndefined(); + expect(root.end_time).toBeUndefined(); + expect(root.duration_ms).toBeUndefined(); + expect(root.token_usage).toBeUndefined(); + expect(session.external_trace).toEqual({ + 
provider: 'codex', + session_id: 'codex-session-789', + }); + expect(JSON.stringify(session.external_trace)).not.toContain('secret'); + expect(JSON.stringify(session.external_trace)).not.toContain('not-a-url'); + }); + + it('preserves duplicate span IDs with collision-free node IDs and diagnostics', () => { + const tree = buildTraceSpanTree([ + { + id: 'root', + span_id: 'root', + parent_span_id: null, + name: 'root', + start_time_unix_nano: '1000', + }, + { + id: 'dup', + span_id: 'dup', + parent_span_id: 'root', + name: 'first duplicate', + start_time_unix_nano: '1100', + }, + { + id: 'dup', + span_id: 'dup', + parent_span_id: 'root', + name: 'second duplicate', + start_time_unix_nano: '1200', + }, + ]); + const nodes = flattenTree(tree); + + expect(nodes.map((node) => node.id)).toEqual(['root', 'dup', 'dup#2']); + expect(nodes.map((node) => node.span.name)).toEqual([ + 'root', + 'first duplicate', + 'second duplicate', + ]); + expect(nodes[2].diagnostics?.map((diagnostic) => diagnostic.code)).toEqual([ + 'duplicate_span_id', + ]); + }); + + it('promotes self-parented spans and ancestor cycles to diagnostic roots', () => { + const tree = buildTraceSpanTree([ + { + id: 'self', + span_id: 'self', + parent_span_id: 'self', + name: 'self', + start_time_unix_nano: '3000', + }, + { + id: 'cycle-a', + span_id: 'cycle-a', + parent_span_id: 'cycle-b', + name: 'cycle-a', + start_time_unix_nano: '1000', + }, + { + id: 'cycle-b', + span_id: 'cycle-b', + parent_span_id: 'cycle-a', + name: 'cycle-b', + start_time_unix_nano: '2000', + }, + ]); + const nodes = flattenTree(tree); + + expect(tree.map((node) => node.spanId)).toEqual(['cycle-a', 'cycle-b', 'self']); + expect(nodes.every((node) => node.children.length === 0)).toBe(true); + expect(nodes.map((node) => node.diagnostics?.[0]?.code)).toEqual([ + 'cycle', + 'cycle', + 'self_parent', + ]); + }); + + it('keeps missing-ID and missing-parent spans as diagnostic roots', () => { + const tree = buildTraceSpanTree([ + { + id: '', 
+ span_id: '', + parent_span_id: null, + name: 'missing id', + }, + { + id: 'orphan', + span_id: 'orphan', + parent_span_id: 'missing-parent', + name: 'orphan', + }, + ]); + + expect(tree.map((node) => node.id)).toEqual(['missing-span-0', 'orphan']); + expect(tree.map((node) => node.diagnostics?.[0]?.code)).toEqual([ + 'missing_span_id', + 'missing_parent', + ]); + }); + + it('sorts roots and children by start time with stable span ID tie breaks', () => { + const tree = buildTraceSpanTree([ + { + id: 'root-b', + span_id: 'root-b', + parent_span_id: null, + name: 'root-b', + start_time_unix_nano: '2000', + }, + { + id: 'child-late', + span_id: 'child-late', + parent_span_id: 'root-a', + name: 'child-late', + start_time_unix_nano: '1200', + }, + { + id: 'root-a', + span_id: 'root-a', + parent_span_id: null, + name: 'root-a', + start_time_unix_nano: '1000', + }, + { + id: 'child-early', + span_id: 'child-early', + parent_span_id: 'root-a', + name: 'child-early', + start_time_unix_nano: '1100', + }, + { + id: 'child-alpha', + span_id: 'child-alpha', + parent_span_id: 'root-a', + name: 'child-alpha', + start_time_unix_nano: '1200', + }, + ]); + + expect(tree.map((node) => node.spanId)).toEqual(['root-a', 'root-b']); + expect(tree[0].children.map((node) => node.spanId)).toEqual([ + 'child-early', + 'child-alpha', + 'child-late', + ]); + }); + + it('keeps new API fixtures snake_case-only outside opaque attributes maps', () => { + expectSnakeCaseFixtureKeys(traceSessionEnvelopeFixture); + expectSnakeCaseFixtureKeys(traceSessionMissingOptionalFixture); + }); +}); diff --git a/apps/dashboard/src/lib/trace-read-model.ts b/apps/dashboard/src/lib/trace-read-model.ts new file mode 100644 index 000000000..b14fa4c2b --- /dev/null +++ b/apps/dashboard/src/lib/trace-read-model.ts @@ -0,0 +1,696 @@ +import type { + ExternalTraceMetadata, + TraceSessionEvent, + TraceSessionEventKind, + TraceSessionResponse, + TraceSessionScore, + TraceSessionSource, + TraceSessionSpan, + 
TraceSessionTokenUsage, +} from './types'; + +export const TRACE_SESSION_SCHEMA_VERSION = 'agentv.dashboard.trace_session.v1' as const; + +export interface TraceSessionProjectionOptions { + runId?: string; + artifactPath?: string; +} + +export interface TraceSpanNode { + id: string; + spanId: string; + parentSpanId?: string | null; + span: TraceSessionSpan; + children: TraceSpanNode[]; + diagnostics?: TraceSpanTreeDiagnostic[]; +} + +export type TraceSpanTreeDiagnosticCode = + | 'cycle' + | 'duplicate_span_id' + | 'missing_parent' + | 'missing_span_id' + | 'self_parent'; + +export interface TraceSpanTreeDiagnostic { + code: TraceSpanTreeDiagnosticCode; + message: string; + span_id?: string; + node_id?: string; + parent_span_id?: string; +} + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function asRecord(value: unknown): Record | undefined { + return isRecord(value) ? value : undefined; +} + +function asArray(value: unknown): unknown[] { + return Array.isArray(value) ? value : []; +} + +function stringValue(value: unknown): string | undefined { + return typeof value === 'string' && value.length > 0 ? value : undefined; +} + +function finiteNumber(value: unknown): number | undefined { + return typeof value === 'number' && Number.isFinite(value) ? value : undefined; +} + +function boolValue(value: unknown): boolean | undefined { + return typeof value === 'boolean' ? value : undefined; +} + +function dropUndefined>(value: T): T { + return Object.fromEntries(Object.entries(value).filter(([, entry]) => entry !== undefined)) as T; +} + +function compactRecord(value: Record): Record | undefined { + const compacted = dropUndefined(value); + return Object.keys(compacted).length > 0 ? compacted : undefined; +} + +function nonEmptyArray(value: readonly T[] | undefined): readonly T[] | undefined { + return value && value.length > 0 ? 
value : undefined; +} + +function unixNanoToIso(value: string | undefined): string | undefined { + if (!value) { + return undefined; + } + try { + return new Date(Number(BigInt(value) / 1_000_000n)).toISOString(); + } catch { + return undefined; + } +} + +function durationMsFromNanos( + start: string | undefined, + end: string | undefined, +): number | undefined { + if (!start || !end) { + return undefined; + } + try { + const startNanos = BigInt(start); + const endNanos = BigInt(end); + if (endNanos < startNanos) { + return undefined; + } + return Number(endNanos - startNanos) / 1_000_000; + } catch { + return undefined; + } +} + +function numberFromAttributes( + attributes: Record, + keys: readonly string[], +): number | undefined { + for (const key of keys) { + const value = finiteNumber(attributes[key]); + if (value !== undefined) { + return value; + } + } + return undefined; +} + +function tokenUsageFromAttributes( + attributes: Record | undefined, +): TraceSessionTokenUsage | undefined { + if (!attributes) { + return undefined; + } + + const nested = asRecord(attributes.token_usage); + const usage = compactRecord({ + input: + finiteNumber(nested?.input) ?? + numberFromAttributes(attributes, [ + 'gen_ai.usage.input_tokens', + 'llm.token_count.prompt', + 'input_tokens', + ]), + output: + finiteNumber(nested?.output) ?? + numberFromAttributes(attributes, [ + 'gen_ai.usage.output_tokens', + 'llm.token_count.completion', + 'output_tokens', + ]), + reasoning: + finiteNumber(nested?.reasoning) ?? + numberFromAttributes(attributes, [ + 'gen_ai.usage.reasoning.output_tokens', + 'reasoning_tokens', + ]), + cached: + finiteNumber(nested?.cached) ?? + numberFromAttributes(attributes, ['gen_ai.usage.cache_read.input_tokens', 'cached_tokens']), + total: finiteNumber(nested?.total) ?? 
numberFromAttributes(attributes, ['total_tokens']), + }); + + return usage as TraceSessionTokenUsage | undefined; +} + +function isExternalTraceKey(key: string): boolean { + return ( + key === 'external_trace' || + key.startsWith('external_trace_') || + key.startsWith('external_trace.') + ); +} + +function isCredentialLikeKey(key: string): boolean { + const normalized = key.toLowerCase(); + if ( + normalized === 'token_usage' || + normalized.endsWith('_tokens') || + normalized.endsWith('.tokens') || + normalized.includes('usage.') + ) { + return false; + } + return /(^|[._-])(api[._-]?key|authorization|bearer|password|secret|private[._-]?key|access[._-]?token|auth[._-]?token|client[._-]?secret|id[._-]?token|refresh[._-]?token|session[._-]?token|token)($|[._-])/.test( + normalized, + ); +} + +function sanitizeAttributeMap( + value: Record | undefined, +): Record | undefined { + if (!value) { + return undefined; + } + const entries = Object.entries(value).flatMap(([key, entry]) => { + if (isExternalTraceKey(key) || isCredentialLikeKey(key)) { + return []; + } + if (isRecord(entry)) { + const nested = sanitizeAttributeMap(entry); + return nested ? [[key, nested] as const] : []; + } + return [[key, entry] as const]; + }); + return entries.length > 0 ? Object.fromEntries(entries) : undefined; +} + +function spanStatusFromValue(value: unknown): TraceSessionSpan['status'] { + const record = asRecord(value); + if (!record) { + return undefined; + } + return compactRecord({ + code: + stringValue(record.code) ?? + (typeof record.code === 'number' ? 
String(record.code) : undefined), + message: stringValue(record.message), + }) as TraceSessionSpan['status']; +} + +function eventKind( + name: string, + attributes: Record | undefined, +): TraceSessionEventKind { + const lowerName = name.toLowerCase(); + if ( + lowerName.includes('score') || + finiteNumber(attributes?.score) !== undefined || + finiteNumber(attributes?.['agentv.score']) !== undefined || + finiteNumber(attributes?.['agentv.grader.score']) !== undefined + ) { + return 'score'; + } + if ( + lowerName.includes('annotation') || + stringValue(attributes?.text) !== undefined || + stringValue(attributes?.annotation) !== undefined || + stringValue(attributes?.['agentv.annotation.text']) !== undefined + ) { + return 'annotation'; + } + if (lowerName === 'exception') { + return 'exception'; + } + return 'event'; +} + +function scoreFromEvent(attributes: Record | undefined): number | undefined { + if (!attributes) { + return undefined; + } + return ( + finiteNumber(attributes.score) ?? + finiteNumber(attributes['agentv.score']) ?? + finiteNumber(attributes['agentv.grader.score']) + ); +} + +function textFromEvent(attributes: Record | undefined): string | undefined { + if (!attributes) { + return undefined; + } + return ( + stringValue(attributes.text) ?? + stringValue(attributes.annotation) ?? + stringValue(attributes['agentv.annotation.text']) ?? + stringValue(attributes['exception.message']) + ); +} + +function passedFromEvent(attributes: Record | undefined): boolean | undefined { + if (!attributes) { + return undefined; + } + return boolValue(attributes.passed) ?? boolValue(attributes['agentv.annotation.passed']); +} + +function eventId( + spanId: string, + index: number, + attributes: Record | undefined, +): string { + return ( + stringValue(attributes?.event_id) ?? + stringValue(attributes?.['agentv.event_id']) ?? 
+ `${spanId}:event:${index}` + ); +} + +function projectSpanEvent( + spanId: string, + event: unknown, + index: number, +): TraceSessionEvent | undefined { + const record = asRecord(event); + if (!record) { + return undefined; + } + const name = stringValue(record.name); + if (!name) { + return undefined; + } + + const attributes = asRecord(record.attributes); + const safeAttributes = sanitizeAttributeMap(attributes); + return dropUndefined({ + event_id: eventId(spanId, index, attributes), + span_id: spanId, + name, + kind: eventKind(name, attributes), + time_unix_nano: stringValue(record.time_unix_nano), + timestamp: unixNanoToIso(stringValue(record.time_unix_nano)), + score: scoreFromEvent(attributes), + text: textFromEvent(attributes), + passed: passedFromEvent(attributes), + attributes: safeAttributes, + }); +} + +function projectSpan(span: unknown, index: number): TraceSessionSpan | undefined { + const record = asRecord(span); + if (!record) { + return undefined; + } + + const spanId = stringValue(record.span_id) ?? `span-${index}`; + const traceId = stringValue(record.trace_id); + const parentSpanId = record.parent_span_id === null ? null : stringValue(record.parent_span_id); + const attributes = asRecord(record.attributes); + const safeAttributes = sanitizeAttributeMap(attributes); + const startTimeUnixNano = stringValue(record.start_time_unix_nano); + const endTimeUnixNano = stringValue(record.end_time_unix_nano); + const events = asArray(record.events) + .map((event, eventIndex) => projectSpanEvent(spanId, event, eventIndex)) + .filter((event): event is TraceSessionEvent => event !== undefined); + + return dropUndefined({ + id: spanId, + trace_id: traceId, + span_id: spanId, + parent_span_id: parentSpanId, + name: stringValue(record.name) ?? 
spanId, + kind: stringValue(record.kind), + status: spanStatusFromValue(record.status), + start_time_unix_nano: startTimeUnixNano, + end_time_unix_nano: endTimeUnixNano, + start_time: unixNanoToIso(startTimeUnixNano), + end_time: unixNanoToIso(endTimeUnixNano), + duration_ms: durationMsFromNanos(startTimeUnixNano, endTimeUnixNano), + token_usage: tokenUsageFromAttributes(attributes), + attributes: safeAttributes, + events: events.length > 0 ? events : undefined, + }); +} + +function projectScores(scores: unknown): TraceSessionScore[] | undefined { + const projected: TraceSessionScore[] = []; + + for (const score of asArray(scores)) { + const record = asRecord(score); + const name = stringValue(record?.name); + const value = finiteNumber(record?.score); + if (!record || !name || value === undefined) { + continue; + } + projected.push( + dropUndefined({ + name, + type: stringValue(record.type), + score: value, + weight: finiteNumber(record.weight), + verdict: stringValue(record.verdict), + source: stringValue(record.source), + evaluated_at: stringValue(record.evaluated_at), + target_span_id: stringValue(record.target_span_id), + evidence: asRecord(record.evidence), + }) as TraceSessionScore, + ); + } + + return projected.length > 0 ? 
projected : undefined; +} + +const EXTERNAL_TRACE_KEYS = ['provider', 'project', 'session_id', 'trace_id', 'url'] as const; + +function isSecretLikeKey(key: string): boolean { + return /(api[_-]?key|authorization|bearer|password|secret|token)/i.test(key); +} + +function sanitizeUrl(value: unknown): string | undefined { + const raw = stringValue(value); + if (!raw) { + return undefined; + } + try { + const url = new URL(raw); + if (!['http:', 'https:'].includes(url.protocol) || url.username || url.password) { + return undefined; + } + url.search = ''; + url.hash = ''; + return url.toString(); + } catch { + return undefined; + } +} + +function sanitizeExternalTrace(value: unknown): ExternalTraceMetadata | undefined { + const record = asRecord(value); + if (!record) { + return undefined; + } + + const sanitized = compactRecord({ + provider: stringValue(record.provider), + project: stringValue(record.project), + session_id: stringValue(record.session_id), + trace_id: stringValue(record.trace_id), + url: sanitizeUrl(record.url), + }) as ExternalTraceMetadata | undefined; + + return sanitized && EXTERNAL_TRACE_KEYS.some((key) => sanitized[key] !== undefined) + ? sanitized + : undefined; +} + +function externalTraceFromFlatMetadata( + metadata: Record | undefined, +): ExternalTraceMetadata | undefined { + if (!metadata) { + return undefined; + } + return sanitizeExternalTrace({ + provider: metadata.external_trace_provider ?? metadata['external_trace.provider'], + project: metadata.external_trace_project ?? metadata['external_trace.project'], + session_id: metadata.external_trace_session_id ?? metadata['external_trace.session_id'], + trace_id: metadata.external_trace_trace_id ?? metadata['external_trace.trace_id'], + url: metadata.external_trace_url ?? 
metadata['external_trace.url'], + }); +} + +function sanitizeMetadata( + value: Record | undefined, +): Record | undefined { + if (!value) { + return undefined; + } + const entries = Object.entries(value).flatMap(([key, entry]) => { + if (isExternalTraceKey(key) || isSecretLikeKey(key)) { + return []; + } + if (isRecord(entry)) { + const nested = sanitizeMetadata(entry); + return nested ? [[key, nested] as const] : []; + } + return [[key, entry] as const]; + }); + return entries.length > 0 ? Object.fromEntries(entries) : undefined; +} + +function sourceFromEnvelope( + source: Record | undefined, + artifactPath: string | undefined, +): TraceSessionSource | undefined { + if (!source && !artifactPath) { + return undefined; + } + return compactRecord({ + kind: stringValue(source?.kind), + path: stringValue(source?.path), + provider: stringValue(source?.provider), + format: stringValue(source?.format), + version: stringValue(source?.version), + artifact_path: artifactPath, + metadata: sanitizeMetadata(asRecord(source?.metadata)), + }) as TraceSessionSource | undefined; +} + +function externalTraceFromEnvelope( + envelope: Record, +): ExternalTraceMetadata | undefined { + const source = asRecord(envelope.source); + const sourceMetadata = asRecord(source?.metadata); + const trace = asRecord(envelope.trace); + const rootSpanId = stringValue(trace?.root_span_id); + const rootSpan = asArray(trace?.spans) + .map(asRecord) + .find((span) => stringValue(span?.span_id) === rootSpanId); + const rootAttributes = asRecord(rootSpan?.attributes); + + return ( + sanitizeExternalTrace(envelope.external_trace) ?? + sanitizeExternalTrace(sourceMetadata?.external_trace) ?? + externalTraceFromFlatMetadata(sourceMetadata) ?? + externalTraceFromFlatMetadata(rootAttributes) + ); +} + +export function traceEnvelopeToTraceSessionResponse( + input: unknown, + options: TraceSessionProjectionOptions = {}, +): TraceSessionResponse { + const envelope = asRecord(input) ?? 
{}; + const evaluation = asRecord(envelope.eval); + const trace = asRecord(envelope.trace); + const spans = asArray(trace?.spans) + .map(projectSpan) + .filter((span): span is TraceSessionSpan => span !== undefined); + const events = spans.flatMap((span) => span.events ?? []); + + return dropUndefined({ + schema_version: TRACE_SESSION_SCHEMA_VERSION, + artifact_id: stringValue(envelope.artifact_id), + created_at: stringValue(envelope.created_at), + run_id: options.runId ?? stringValue(evaluation?.run_id), + test_id: stringValue(evaluation?.test_id), + suite: stringValue(evaluation?.suite), + target: stringValue(evaluation?.target), + trace_id: stringValue(trace?.trace_id), + root_span_id: stringValue(trace?.root_span_id), + source: sourceFromEnvelope(asRecord(envelope.source), options.artifactPath), + external_trace: externalTraceFromEnvelope(envelope), + spans, + events, + scores: projectScores(envelope.scores), + }); +} + +export function buildTraceSpanTree(spans: readonly TraceSessionSpan[]): TraceSpanNode[] { + const nodes: TraceSpanNode[] = []; + const firstNodeBySpanId = new Map(); + const spanIdCounts = new Map(); + + spans.forEach((span, index) => { + const rawSpanId = stringValue(span.span_id); + const spanId = rawSpanId ?? `missing-span-${index}`; + const occurrence = (spanIdCounts.get(spanId) ?? 0) + 1; + spanIdCounts.set(spanId, occurrence); + + const node: TraceSpanNode = { + id: occurrence === 1 ? spanId : `${spanId}#${occurrence}`, + spanId, + parentSpanId: span.parent_span_id, + span, + children: [], + diagnostics: rawSpanId + ? 
undefined + : [ + { + code: 'missing_span_id', + message: 'Span was missing span_id and was assigned a stable node id.', + node_id: spanId, + }, + ], + }; + + if (occurrence > 1) { + addNodeDiagnostic(node, { + code: 'duplicate_span_id', + message: 'Duplicate span_id was preserved with a collision-free node id.', + span_id: spanId, + node_id: node.id, + }); + } + if (!firstNodeBySpanId.has(spanId)) { + firstNodeBySpanId.set(spanId, node); + } + nodes.push(node); + }); + + const parentByNodeId = new Map(); + for (const node of nodes) { + const parentSpanId = + typeof node.parentSpanId === 'string' && node.parentSpanId.length > 0 + ? node.parentSpanId + : undefined; + if (!parentSpanId) { + continue; + } + if (parentSpanId === node.spanId) { + addNodeDiagnostic(node, { + code: 'self_parent', + message: 'Span parent_span_id points to itself; span was promoted to a root.', + span_id: node.spanId, + node_id: node.id, + parent_span_id: parentSpanId, + }); + continue; + } + const parent = firstNodeBySpanId.get(parentSpanId); + if (!parent) { + addNodeDiagnostic(node, { + code: 'missing_parent', + message: 'Span parent_span_id was not present in this trace; span was promoted to a root.', + span_id: node.spanId, + node_id: node.id, + parent_span_id: parentSpanId, + }); + continue; + } + parentByNodeId.set(node.id, parent); + } + + const cyclicNodes: TraceSpanNode[] = []; + for (const node of nodes) { + if (hasAncestorCycle(node, parentByNodeId)) { + cyclicNodes.push(node); + } + } + for (const node of cyclicNodes) { + parentByNodeId.delete(node.id); + addNodeDiagnostic(node, { + code: 'cycle', + message: 'Span parent chain contains a cycle; span was promoted to a root.', + span_id: node.spanId, + node_id: node.id, + parent_span_id: typeof node.parentSpanId === 'string' ? 
node.parentSpanId : undefined, + }); + } + + const roots: TraceSpanNode[] = []; + for (const node of nodes) { + const parent = parentByNodeId.get(node.id); + if (parent) { + parent.children.push(node); + } else { + roots.push(node); + } + } + + sortTraceSpanNodes(roots); + return roots; +} + +function addNodeDiagnostic(node: TraceSpanNode, diagnostic: TraceSpanTreeDiagnostic): void { + node.diagnostics = [...(node.diagnostics ?? []), diagnostic]; +} + +function hasAncestorCycle( + node: TraceSpanNode, + parentByNodeId: ReadonlyMap, +): boolean { + const seen = new Set(); + let cursor = parentByNodeId.get(node.id); + while (cursor) { + if (cursor.id === node.id || seen.has(cursor.id)) { + return true; + } + seen.add(cursor.id); + cursor = parentByNodeId.get(cursor.id); + } + return false; +} + +function compareUnixNanoValue(first: string | undefined, second: string | undefined): number { + if (first === second) { + return 0; + } + if (!first) { + return 1; + } + if (!second) { + return -1; + } + try { + const firstValue = BigInt(first); + const secondValue = BigInt(second); + return firstValue < secondValue ? -1 : firstValue > secondValue ? 1 : 0; + } catch { + return first.localeCompare(second); + } +} + +function compareTraceSpanNodes(first: TraceSpanNode, second: TraceSpanNode): number { + const byStart = compareUnixNanoValue( + first.span.start_time_unix_nano, + second.span.start_time_unix_nano, + ); + if (byStart !== 0) { + return byStart; + } + if (first.spanId === second.parentSpanId) { + return -1; + } + if (second.spanId === first.parentSpanId) { + return 1; + } + const bySpanId = first.spanId.localeCompare(second.spanId); + return bySpanId !== 0 ? 
bySpanId : first.id.localeCompare(second.id); +} + +function sortTraceSpanNodes(nodes: TraceSpanNode[]): void { + nodes.sort(compareTraceSpanNodes); + for (const node of nodes) { + node.children.sort(compareTraceSpanNodes); + if (node.children.length > 0) { + sortTraceSpanNodes(node.children); + } + node.diagnostics = nonEmptyArray(node.diagnostics) as TraceSpanTreeDiagnostic[] | undefined; + } +} diff --git a/apps/dashboard/src/lib/types.ts b/apps/dashboard/src/lib/types.ts index 087836a37..a3d7d20b3 100644 --- a/apps/dashboard/src/lib/types.ts +++ b/apps/dashboard/src/lib/types.ts @@ -35,6 +35,10 @@ export interface RunMeta { pending_tags?: string[]; /** True when local editable metadata differs from the fetched remote metadata. */ metadata_dirty?: boolean; + /** Materialized final run state consumed by readers instead of folding raw operations. */ + final_state?: RunFinalState; + /** Operation-log watermark for the materialized final state. */ + oplog_watermark?: RunOplogWatermark; /** * Live execution status. Only present for Dashboard-launched runs that are * still being tracked in-memory — used to render a spinner in RunList @@ -44,6 +48,17 @@ export interface RunMeta { status?: 'starting' | 'running' | 'finished' | 'failed'; } +export interface RunOplogWatermark { + ref: string; + operation_id?: string; + updated_at?: string; +} + +export interface RunFinalState { + lifecycle: 'active' | 'hidden' | 'deleted'; + tags: string[]; +} + export interface RunListResponse { runs: RunMeta[]; next_cursor?: string; @@ -120,6 +135,103 @@ export interface SourceTraceability { referenced_files?: SourceReferencedFile[]; } +export interface ExternalTraceMetadata { + /** + * Optional external viewer reference only. AgentV run artifacts remain the + * canonical source of truth for Dashboard trace/session details. 
+ */ + provider?: string; + project?: string; + session_id?: string; + trace_id?: string; + url?: string; +} + +export interface TraceSessionTokenUsage { + input?: number; + output?: number; + reasoning?: number; + cached?: number; + total?: number; +} + +export interface TraceSessionSpanStatus { + code?: string; + message?: string; +} + +export type TraceSessionEventKind = 'annotation' | 'exception' | 'event' | 'score'; + +export interface TraceSessionEvent { + event_id: string; + span_id: string; + name: string; + kind: TraceSessionEventKind; + time_unix_nano?: string; + timestamp?: string; + score?: number; + text?: string; + passed?: boolean; + attributes?: Record; +} + +export interface TraceSessionSpan { + id: string; + trace_id?: string; + span_id: string; + parent_span_id?: string | null; + name: string; + kind?: string; + status?: TraceSessionSpanStatus; + start_time_unix_nano?: string; + end_time_unix_nano?: string; + start_time?: string; + end_time?: string; + duration_ms?: number; + token_usage?: TraceSessionTokenUsage; + attributes?: Record; + events?: TraceSessionEvent[]; +} + +export interface TraceSessionScore { + name: string; + type?: string; + score: number; + weight?: number; + verdict?: string; + source?: string; + evaluated_at?: string; + target_span_id?: string; + evidence?: Record; +} + +export interface TraceSessionSource { + kind?: string; + path?: string; + provider?: string; + format?: string; + version?: string; + artifact_path?: string; + metadata?: Record; +} + +export interface TraceSessionResponse { + schema_version: 'agentv.dashboard.trace_session.v1'; + artifact_id?: string; + created_at?: string; + run_id?: string; + test_id?: string; + suite?: string; + target?: string; + trace_id?: string; + root_span_id?: string; + source?: TraceSessionSource; + external_trace?: ExternalTraceMetadata; + spans: TraceSessionSpan[]; + events: TraceSessionEvent[]; + scores?: TraceSessionScore[]; +} + export interface EvalResult { testId: string; 
timestamp?: string; @@ -149,6 +261,8 @@ export interface RunDetailResponse { results: EvalResult[]; source: 'local' | 'remote'; source_label?: string; + final_state?: RunFinalState; + oplog_watermark?: RunOplogWatermark; /** Live execution status when this run is still tracked in-memory by Dashboard. */ status?: 'starting' | 'running' | 'finished' | 'failed'; /** Path to the run workspace directory (relative to cwd when inside, otherwise absolute). Local runs only. */ @@ -176,6 +290,19 @@ export interface EvalDetailResponse { eval: EvalResult; } +export type TranscriptArtifactStatus = 'ok' | 'missing' | 'dangling' | 'unsupported'; + +export interface TranscriptArtifactResponse { + status: TranscriptArtifactStatus; + transcript_path?: string; + answer_path?: string; + answer_content?: string; + content?: string; + language?: string; + message?: string; + pointer?: string; +} + export interface IndexEntry { run_filename: string; display_name?: string; @@ -260,6 +387,8 @@ export interface CompareRunEntry { remote_tags?: string[]; pending_tags?: string[]; metadata_dirty?: boolean; + final_state?: RunFinalState; + oplog_watermark?: RunOplogWatermark; source: 'local' | 'remote'; eval_count: number; quality_count?: number; @@ -283,6 +412,8 @@ export interface RunTagsResponse { remote_tags?: string[]; pending_tags?: string[]; metadata_dirty?: boolean; + final_state?: RunFinalState; + oplog_watermark?: RunOplogWatermark; updated_at: string; } diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index e3fc7803e..ae18284b7 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -439,22 +439,37 @@ Each result row's `artifact_dir` can include both `outputs/trace.json` and artifact; use each index row's `transcript_path` to find the per-result transcript. 
+Rows also include `artifact_pointers` for AgentV-owned artifact storage. Pointer +entries such as `artifact_pointers.trace` and `artifact_pointers.transcript` +carry the storage `ref`, artifact `key`, canonical run-relative `path`, +`object_version`, `sha256`, `size`, `schema_version`, and `media_type` so +viewers and exports can migrate from git refs to object storage without changing +the run record contract. + `outputs/trace.json` is the full-fidelity `agentv.trace.v1` sidecar. It stores the canonical span graph, source metadata, capture/redaction policy, conversion warnings, score provenance, and opaque evidence references. -`outputs/transcript.jsonl` is a derived compatibility artifact for reading and -replay. It uses provider-neutral `agentv.transcript.v1` rows with stable -top-level fields for message order, role/content, tool calls and paired results, -timing, token usage, cost, source metadata, capture state, and trace pointers. +`outputs/transcript.jsonl` is the canonical AgentV transcript/timeline artifact. +It uses provider-neutral `agentv.transcript.v1` rows with stable top-level fields +for message order, role/content, tool calls and paired results, timing, token +usage, cost, source metadata, capture state, and trace pointers. Provider-native payloads can appear only inside opaque nested fields such as `metadata`, `source.metadata`, tool `input`, or tool `output`. +When an agent provider captures a native stream or session log, the result row +may also include `raw_provider_log_path`, pointing at +`outputs/raw/provider.log`. That file is raw evidence copied byte-for-byte from +the provider log and is not parsed, normalized, or required for replay, import, +Agent Skills conversion, or grading. AgentV does not write or maintain a +parallel `outputs/transcript.json` source of truth. 
+ Use the transcript when you need a compact portable message/event projection over the trace, including exports to role/content arrays for chat-template or Hugging Face-style workflows. Use the trace when you need full lifecycle, span, -raw evidence, redaction, or adapter conversion details. The transcript is not a -second canonical trace source and is not a provider-native Pi session dump. +raw evidence pointers, redaction, or adapter conversion details. The transcript +is not a second canonical trace source and is not a provider-native Pi session +dump. Older transcript rows without `schema_version`, `capture`, or `trace` remain accepted for replay. diff --git a/apps/web/src/content/docs/docs/tools/dashboard.mdx b/apps/web/src/content/docs/docs/tools/dashboard.mdx index fadec9963..fd5f9eced 100644 --- a/apps/web/src/content/docs/docs/tools/dashboard.mdx +++ b/apps/web/src/content/docs/docs/tools/dashboard.mdx @@ -146,7 +146,7 @@ Select 2+ rows with the checkboxes and click the sticky **Compare N** action to ### Retroactive tags -Click any row's **Tags** cell to tag a run after the fact. Each run can carry multiple free-form tags (max 20, up to 60 characters each); local tags are stored in a `tags.json` sidecar next to `index.jsonl` in the run workspace, so they're mutable, non-destructive, and won't touch your eval YAML or run manifest. The chip editor supports Enter/comma to commit a new tag, Backspace to remove the last chip, and **Clear all** to remove every tag (deletes the sidecar). +Click any row's **Tags** cell to tag a run after the fact. Each run can carry multiple free-form tags (max 20, up to 60 characters each); local tags are stored in a `tags.json` sidecar next to `index.jsonl` in the run workspace, so they're mutable, non-destructive, and won't touch your eval YAML or run manifest. The chip editor supports Enter/comma to commit a new tag, Backspace to remove the last chip, and **Clear all** to record an empty tag state with an operation watermark. 
Remote run payloads stay immutable, but their tags are editable. Dashboard writes remote tag changes as metadata overlays under `.agentv/results/metadata/runs/.../tags.json` in the configured results repo clone. Until those overlays are synced, the run and project show a dirty state; **Sync Project** commits and pushes them when it is safe to do so. diff --git a/apps/web/src/content/docs/docs/tools/import.mdx b/apps/web/src/content/docs/docs/tools/import.mdx index 54794e6ec..86ff592aa 100644 --- a/apps/web/src/content/docs/docs/tools/import.mdx +++ b/apps/web/src/content/docs/docs/tools/import.mdx @@ -176,7 +176,10 @@ Rows without `schema_version`, `capture`, or `trace` from older AgentV transcrip exports remain replayable. New eval run artifacts write the v1 shape. For eval run artifacts, `outputs/transcript.jsonl` is derived from `outputs/trace.json`; it is a portable message/event projection, not a second -canonical trace source or a provider-native session dump. +canonical trace source or a provider-native session dump. Provider-native +session or stream logs, when captured during an eval run, are separate raw +evidence artifacts referenced by `raw_provider_log_path`; Agent Skills import, +convert, transpile, and run paths do not require them. ## What Gets Parsed diff --git a/apps/web/src/content/docs/docs/tools/results.mdx b/apps/web/src/content/docs/docs/tools/results.mdx index 4837595aa..add4afdfc 100644 --- a/apps/web/src/content/docs/docs/tools/results.mdx +++ b/apps/web/src/content/docs/docs/tools/results.mdx @@ -108,6 +108,50 @@ Duplicate policy is explicit: `attempt` defaults to `0`, `variant` defaults to `null`, and `source_target` defaults to `target` when a run has no replay source. Replay and rerun sources can set `source_target`, `attempt`, or `variant`; those values are part of the identity, so different attempts, variants, or source targets produce distinct projection IDs. 
+### Vendor-neutral projection bundle + +Use the additive projection bundle path when an external adapter needs a +backend-neutral handoff instead of AgentV's full artifact tree: + +```bash +agentv results export --projection-bundle +``` + +This writes `projection_bundle.json` next to the exported artifacts. The bundle +contains stable projection IDs, trace envelope metadata, OpenInference-shaped +span references, score provenance, artifact-relative paths, capture/redaction +summary, and conversion warnings. It does not call Phoenix, Opik, Braintrust, +Langfuse, Hugging Face, or any other live service. + +For adapter development and CI snapshots, use dry-run mode: + +```bash +agentv results export --dry-run > projection_bundle.json +``` + +Dry-run prints deterministic JSON and does not write export artifacts. Vendor +adapters should consume either this JSON directly or the local +`projection_bundle.json`. Dry-run refs are marked +`artifact_refs.status: "planned_export"` because the export tree has not been +written. Bundles written with `--projection-bundle` are built from the emitted +export `index.jsonl` and use `artifact_refs.status: "emitted"`. + +Raw prompt text, final output, and tool arguments/results are excluded by +default, and raw-bearing artifact refs such as `grading_path`, `input_path`, +`answer_path`, `response_path`, `transcript_path`, and `trace_path` are omitted from +metadata-only bundles. To include raw payloads and raw-bearing refs in the +bundle, opt in explicitly: + +```bash +agentv results export --dry-run --include-raw-content +``` + +Keep backend-specific anonymization in the adapter layer. For example, an Opik +adapter can read the metadata-only bundle by default, or require +`--include-raw-content` and then run Opik anonymizers before upload. AgentV does +not run a custom redaction engine in `results export`; it records the capture +policy so downstream processing is auditable. 
+ ## Inspection helpers For lightweight terminal workflows: @@ -127,7 +171,7 @@ The CLI contract is deliberately narrow: `agentv results` manages local result a Use these supported remote workflows instead: -- **Automatic publishing:** configure `projects[].results` or top-level `results`; new `agentv eval` and `agentv pipeline bench` runs publish completed artifacts after the run completes. Use `repo_path: .` with `branch: agentv/results/v1` to store results on a dedicated branch of the source repo. `repo_path` means an existing local Git checkout, distinct from `workspace.repos[].repo`, which is a portable repository identity. Set `sync.auto_push: true` to push after publish, or `sync.require_push: true` in CI to fail when that push fails. While an eval is still running, [WIP checkpoints](/docs/tools/wip-checkpoints/) can keep partial run output durable on `agentv/wip/...` branches when auto-push is enabled. +- **Automatic publishing:** configure `projects[].results` or top-level `results`; new `agentv eval` and `agentv pipeline bench` runs publish completed artifacts after the run completes. Use `repo_path: .` with `branch: agentv/results/v1` to store primary result records on a dedicated branch of the source repo. AgentV reserves `agentv/results/v1` for primary results, `agentv/artifacts/v1` for heavy artifact payloads, and `agentv/oplog/v1` for mutable run/result operations. `repo_path` means an existing local Git checkout, distinct from `workspace.repos[].repo`, which is a portable repository identity. Set `sync.auto_push: true` to push after publish, or `sync.require_push: true` in CI to fail when that push fails. While an eval is still running, [WIP checkpoints](/docs/tools/wip-checkpoints/) can keep partial run output durable on `agentv/wip/...` branches when auto-push is enabled. - **Manual Dashboard sync:** run `agentv dashboard`, open the project, and use **Sync Project**. 
- **Manual API sync:** while Dashboard is running, call `GET /api/projects/:projectId/remote/status` or `POST /api/projects/:projectId/remote/sync` for project-scoped automation. Single-project sessions also expose `GET /api/remote/status` and `POST /api/remote/sync`. - **Git escape hatch:** for advanced recovery, inspect or repair the configured `projects[].results.path` clone with `git` directly, then sync again. diff --git a/docs/dogfood-reports/2026-06-21-dogfood-integration-av-vwa-16-10-dogfood.md b/docs/dogfood-reports/2026-06-21-dogfood-integration-av-vwa-16-10-dogfood.md new file mode 100644 index 000000000..04b66ff1a --- /dev/null +++ b/docs/dogfood-reports/2026-06-21-dogfood-integration-av-vwa-16-10-dogfood.md @@ -0,0 +1,175 @@ +# Dogfood Report - dogfood-integration-av-vwa-16-10 + +> Diff-scoped CLI and Dashboard QA of `dogfood-integration-av-vwa-16-10` vs `origin/main`. Generated by `/ce-dogfood-beta` on 2026-06-21. + +## Diff Summary + +- Adds canonical AgentV artifact refs and pointer shapes for result rows, trace sidecars, transcript projections, raw provider logs, and projection bundles. +- Writes provider-neutral transcript artifacts at `outputs/transcript.jsonl` while keeping raw provider logs separate at `outputs/raw/provider.log`. +- Adds results combine/export/projection behavior that preserves or rewrites artifact pointers and keeps default exports metadata-oriented. +- Adds oplog-shaped run tag state, tag clear tombstones, and Dashboard/API fields for final run state and watermarks. +- Updates Dashboard API and UI so run lists/details stay metadata-oriented while the Transcript tab lazily loads canonical transcript content and handles missing/dangling/unsupported states. +- Adds a Dashboard trace read model that preserves problematic span graphs with diagnostics and sanitizes external trace or credential-like attributes. + +## Personas + +Source: `STRATEGY.md` "Who it's for". 
+ +- **AI platform engineers and agent builders** - evaluate real agent workflows, compare targets, gate changes, and inspect portable run artifacts from the same workspace their teams already use. + +## Flows Tested + +### Flow A - Canonical Result Artifact Emission + +```mermaid +flowchart TD + A[Eval result is written] --> B[Per-test artifact directory is created] + B --> C[Trace envelope written to outputs/trace.json] + C --> D{Trace has transcript rows?} + D -->|Yes| E[Canonical transcript JSONL written to outputs/transcript.jsonl] + D -->|No| F[Transcript path and pointer are omitted] + E --> G{Raw provider log present?} + F --> G + G -->|Yes| H[Raw log copied to outputs/raw/provider.log] + G -->|No| I[Index row excludes raw_provider_log_path] + H --> J[Index row records raw_provider_log_path separately] + I --> K[Index row emits artifact_pointers with agentv/artifacts/v1] + J --> K + K --> L[Consumers parse snake_case row and reject new camelCase artifactPointers] +``` + +### Flow B - Dashboard Metadata and Lazy Transcript Loading + +```mermaid +flowchart TD + A[User opens Dashboard run list] --> B[API loads lightweight run metadata] + B --> C[User opens a run detail] + C --> D[API hydrates detail without transcript bodies] + D --> E[User selects an eval] + E --> F[Checks tab shows metadata and grader state] + F --> G[User opens Transcript tab] + G --> H{Canonical transcript pointer resolves?} + H -->|Yes| I[Transcript endpoint reads outputs/transcript.jsonl lazily] + H -->|Missing| J[No structured transcript state] + H -->|Dangling| K[Unavailable artifact state with path] + H -->|Unsupported| L[Unsupported pointer state with pointer details] + I --> M[Timeline renders transcript and raw/download links] +``` + +### Flow C - Combine Run Artifact Pointer Rewriting + +```mermaid +flowchart TD + A[User selects two or more run workspaces] --> B[Combine reads each index.jsonl] + B --> C{Duplicate test_id and target rows?} + C -->|Error policy| D[Conflict is 
reported] + C -->|Latest or explicit choice| E[Selected rows are kept] + C -->|No duplicates| E + E --> F[Referenced artifacts are copied under sources/source-N] + F --> G[Trace artifact pointers are rewritten] + G --> H[Transcript artifact pointers are rewritten] + H --> I[Combined index.jsonl points only at copied files] +``` + +### Flow D - Tags and Oplog Watermarks + +```mermaid +flowchart TD + A[Run metadata is read] --> B[Tag sidecar or remote state is materialized] + B --> C[User sets tags] + C --> D[run.tags.set operation watermark is written] + D --> E[Run list/detail exposes final_state and oplog_watermark] + E --> F[User clears tags] + F --> G[Empty tag tombstone is written] + G --> H[final_state.tags is empty and clear watermark is preserved] +``` + +### Flow E - Trace Read Model Hardening + +```mermaid +flowchart TD + A[Dashboard reads trace envelope] --> B[Project spans, events, scores, and external trace metadata] + B --> C[Credential-like and unsafe external attributes are removed] + C --> D{Span graph shape} + D -->|Duplicate span ids| E[Preserve nodes with collision-free ids and diagnostics] + D -->|Missing parents| F[Promote span to diagnostic root] + D -->|Self-parent or cycle| G[Promote cyclic spans to diagnostic roots] + E --> H[Stable tree is rendered] + F --> H + G --> H +``` + +### Flow F - Projection Bundle Export + +```mermaid +flowchart TD + A[User requests projection bundle or dry run] --> B[Completed run manifest is read] + B --> C{Raw content opted in?} + C -->|No default| D[Bundle records metadata-only capture policy] + D --> E[artifact_refs are planned_export and omit raw-bearing paths] + E --> F[Trace envelopes omit raw evidence and transcript metadata payloads] + C -->|Yes| G[Bundle includes raw content and emitted artifact refs] + G --> H[Adapters receive explicit full-content payload] +``` + +## Test Matrix & Results + +| # | Flow | Journey / Scenario | Status | Issue | Fix | Commit | 
+|---|------|--------------------|--------|-------|-----|--------| +| 1 | A | Artifact writer emits `outputs/transcript.jsonl`, canonical `artifact_pointers.transcript.ref=agentv/artifacts/v1`, and canonical trace pointer refs. | Pass | Verified by artifact-writer regression tests. | - | - | +| 2 | A | Raw provider log is copied to `outputs/raw/provider.log`, remains separate from canonical transcript rows, and parsed result rows do not treat it as a fresh source log. | Pass | Verified by artifact-writer and orchestrator tests. | - | - | +| 3 | A | New invalid camelCase `artifactPointers` rows are rejected while historical result-row aliases still normalize at the boundary. | Pass | Verified by parser/shared results tests. | - | - | +| 4 | C | Combining runs copies pointed trace/transcript files and rewrites pointer paths/keys to `sources/source-N/...`. | Pass | Verified by combine tests. | - | - | +| 5 | D | Local tag set and tag clear/tombstone operations preserve `final_state` and a fresh `oplog_watermark`. | Pass | Verified by tests and live API set/clear/readback against the fixture server. | - | - | +| 6 | B | Run list, run detail, compare, and index API routes stay metadata-oriented and do not read transcript bodies. | Pass | Verified by serve tests and live API detail payload without transcript body content. | - | - | +| 7 | B | Transcript endpoint returns lazy `ok`, `missing`, `dangling`, and pointer-shaped transcript states from canonical transcript pointers. | Pass | Verified by serve tests, live API calls, and browser Transcript tab states. | - | - | +| 8 | E | Trace read model handles duplicate spans, missing parents, self-parent/cycles, and sanitizes external/credential-like attributes. | Pass | Verified by Dashboard trace read-model tests. | - | - | +| 9 | F | Projection bundle dry-run/default export marks planned refs correctly and excludes raw-bearing payloads by default. 
| Fixed | Live dry run crashed when a hydrated grader score omitted `assertions`. | Added missing-array fallbacks in result index and trace envelope score serialization, plus regression coverage. | b25b0475 | +| 10 | B | Browser UAT: Dashboard run list/detail remains usable, Transcript tab lazy-loads canonical content, and console errors are absent. | Pass | Agent-browser verified run list/detail, canonical/missing/dangling/unsupported Transcript tab states, lazy request logs, and no page errors. | - | - | + +## What Was Fixed + +### Projection bundle dry run crashed on grader scores without assertions - `b25b0475` + +- **Symptom:** `agentv results export --projection-bundle --dry-run` crashed with `undefined is not an object (evaluating 'score.assertions.map')` when a hydrated grading artifact had a grader score without an `assertions` array. +- **Root cause:** `packages/core/src/evaluation/run-artifacts.ts` and `packages/core/src/evaluation/trace-envelope.ts` assumed every `GraderResult` carried `assertions`, but historical or hand-authored grading artifacts can omit that optional array. +- **Fix:** Normalize missing score assertions to an empty array in index-row score serialization and trace-envelope score evidence serialization. +- **Regression test:** `apps/cli/test/commands/results/export.test.ts` now builds a projection bundle from a grader score that omits `assertions`. + +## Console Errors + +None observed through `agent-browser errors` after canonical, missing, dangling, and unsupported Transcript tab checks. `agent-browser console` was also empty on the canonical transcript path. + +Expected test-suite stderr included git fallback warnings for intentionally invalid remote fixtures; the suite passed. + +## Evidence + +- Diff analyzed with `git diff --name-only origin/main...HEAD` and focused code reads across result writing, combine/export, serve, Dashboard detail/API, and trace read model paths. +- Built core with `bun --filter @agentv/core build`. 
+- Built Dashboard with `bun run build` from `apps/dashboard/`; Vite emitted only the existing large-chunk warning. +- Ran focused regression suite after the fix: `333 pass`, `0 fail`, `1372 expect() calls`, across 10 files. +- Live Dashboard/results server started from source against a local fixture project on port 3217. +- Live API checks covered run list/detail, transcript `ok`/`missing`/`dangling`/`unsupported`, tag set/clear/readback, and projection dry run. +- Browser UAT used `agent-browser` with a local fixture project. Screenshots were captured outside the public repo as `transcript-tab.png` and `transcript-unsupported.png`. + +## Human Verifications + +Not applicable. The proof used local fixtures and CLI/Dashboard APIs only; no OAuth, email, payment, SMS, or external provider leg was required. + +## Decisions for a Human + +None. + +## Learnings + +- Projection/export code must tolerate historical or hand-authored grader score records that omit optional arrays. Treat missing optional evidence as empty evidence rather than crashing export. +- The lazy transcript boundary is doing useful work: list/detail payloads remain small and metadata-oriented, while transcript body content is fetched only after the user opens the Transcript tab. +- Raw provider logs stay safe as separate evidence under `outputs/raw/provider.log`; they are not canonical transcripts and should not be reinterpreted as source logs on parsed result rows. + +## Final Status + +Pass after fix. The integrated results/artifacts/transcript stack is ready for review from this dogfood pass. + +Functional failure fixed locally: `b25b0475`. + +Human-decision blockers: none. 
diff --git a/docs/plans/2026-06-21-001-feat-av-quf-results-storage-plan.md b/docs/plans/2026-06-21-001-feat-av-quf-results-storage-plan.md new file mode 100644 index 000000000..e5dcf346b --- /dev/null +++ b/docs/plans/2026-06-21-001-feat-av-quf-results-storage-plan.md @@ -0,0 +1,888 @@ +--- +title: "feat: Specify git-native results storage, retention, and oplog" +type: feat +date: 2026-06-21 +bead: av-quf +base: av-vwa.16.10 +--- + +# feat: Specify git-native results storage, retention, and oplog + +## Summary + +Bead `av-quf` should turn the current git-backed results implementation into a +documented storage contract with three backend modes, a single results branch, +one artifact sidecar namespace, retention and compaction rules, a compact +publication export, an append-only mutable-operation log, and an S3-compatible +object-storage tier. + +The canonical AgentV run artifacts stay `benchmark.json`, `index.jsonl`, per-test +grading/timing files, `outputs/trace.json`, and derived transcript artifacts. GitHub, +Backblaze B2, Phoenix, Hugging Face, and Dashboard are projections, viewers, or storage +backends over those artifacts. + +--- + +## Problem Frame + +`packages/core/src/evaluation/results-repo.ts` already implements the first git-native +slice: `agentv/results/v1` is the default results branch, `runs/**` is listed with +`git ls-tree`, `benchmark.json` blobs are read with `git cat-file --batch`, and the +branch root is a deterministic orphan genesis. Current mutable tags live under +`metadata/runs/**`, and heavy transcript sidecars are still written inside each run +workspace by `packages/core/src/evaluation/run-artifacts.ts`. + +The next implementation beads need a precise shared contract before they split work. +The contract must avoid branch proliferation, keep AgentV artifacts canonical, and +define how git, object storage, retention, publication, and mutable operations compose +without creating another hosted results platform inside AgentV. 
+ +--- + +## Scope Boundaries + +### In Scope + +- Define storage backend modes and per-mode listing/index strategies. +- Pin the git-native ref and path layout for `agentv/results/v1`, + `agentv/artifacts/v1`, and `agentv/oplog/v1`. +- Define retention, compaction, and migration rules for run metadata and heavy artifacts. +- Define compact publication export as a derived artifact over `benchmark.json` and + `index.jsonl`, with no required `eval.txt`. +- Define the mutable operation log and add-wins tag set semantics. +- Define the Backblaze B2 S3-compatible object tier and secret-loading boundary. +- Name concrete files, functions, and tests for dependent implementation beads. + +### Out of Scope + +- Implementing storage backends, S3, oplog, retention, or export code in this bead. +- Adding GitHub issues or tracker runtime state. +- Creating windowed branches, per-run branches, or a hosted Dashboard replacement. +- Making Phoenix, Hugging Face, B2, or GitHub the canonical results model. + +### Deferred to Follow-Up Work + +- Path sharding under `runs/` or artifact prefixes. Only add it after a benchmark with + realistic run counts proves `git ls-tree` or object-store listing is too slow. +- PR-based publishing for human-reviewed result repositories. Machine-generated eval + results should keep direct append commits until a concrete workflow needs review gates. +- A generic non-B2 object-store provider matrix. Start with S3-compatible configuration + narrow enough to support B2 and avoid provider-specific APIs. + +--- + +## Requirements + +### Storage Modes + +- R1. The default mode remains git-native and must work with the current explicit + `repo_path` or `repo_url` results configuration. +- R2. Hybrid mode must keep the run index and metadata in git while moving selected + heavy artifact payloads to object storage. +- R3. 
Blob-native mode must store run index, metadata, artifacts, and oplog in object + storage without requiring a git checkout, git object database, or git remote. +- R4. Each mode must define its listing/index strategy: git tree listing for git-backed + modes and bucket manifest plus `ListObjectsV2` fallback for blob-native mode. + +### Git Layout And Sync + +- R5. The primary results ref is `agentv/results/v1`. +- R6. Heavy artifact sidecars use the single artifact ref or namespace + `agentv/artifacts/v1`, with path prefixes such as `transcripts/`, `raw-logs/`, + and `screenshots/`. +- R7. Mutable operations use the single oplog ref or namespace `agentv/oplog/v1`. +- R8. The git-native branch must keep deterministic orphan genesis and must not create + windowed branches or per-run branches. +- R9. Path sharding is not part of v1 unless measurement at realistic scale proves it is + needed. + +### Retention And Publication + +- R10. Retention must distinguish logical pruning from history compaction in git-backed + modes. +- R11. Hybrid and blob-native modes must support object lifecycle policy alignment for + artifact payloads without deleting index metadata prematurely. +- R12. Transcript migration must support transcripts under + `agentv/artifacts/v1` while preserving existing logical artifact references. +- R13. Publication export must be compact and derived from `benchmark.json` plus + `index.jsonl`; it must not require an authored or generated `eval.txt`. + +### Mutable Operations + +- R14. Mutable run/result operations must be append-only per actor first. +- R15. Tag mutation semantics start as an add-wins tag set, not direct mutation of + immutable run artifacts. +- R16. Oplog storage location must be defined for all three modes. + +### Object Storage + +- R17. The object-storage tier targets Backblaze B2 through its S3-compatible API. +- R18. The implementation must use a standard S3 SDK/client, not B2-native APIs. +- R19. 
Credentials must come from environment or config populated by the BWS CLI, and + resolved secret values must not be written into AgentV artifacts, config examples, or + committed docs. + +--- + +## Key Technical Decisions + +- KTD1. Backend mode is a storage concern, not a product model. Use `git-native`, + `hybrid`, and `blob-native` as storage modes while keeping `benchmark.json` and + `index.jsonl` as the artifact contract that readers consume. +- KTD2. Do not overload the existing `results.mode: github` field. Add + `results.storage_mode` with values `git-native`, `hybrid`, and `blob-native`, and + normalize missing `storage_mode` to `git-native`. Put object-store settings under + `results.object_store`. +- KTD3. The git tree remains the index for git-backed modes. `listGitRuns()` should + continue to list `runs/**/benchmark.json` from `agentv/results/v1`; no separate + branch-local `index/runs.jsonl` is introduced. +- KTD4. Use one artifact sidecar namespace named `artifacts`. Do not introduce + `artifact-blobs`, `blobs`, or per-artifact refs. Prefix by artifact class, for example + `transcripts//...`, `raw-logs//...`, and + `screenshots//...`. +- KTD5. Use sibling Git refs for results, artifacts, and oplog. Git refs are stored + path-like, so `agentv/results/v1` cannot coexist with child refs such as + `agentv/results/v1/artifacts` or `agentv/results/v1/oplog`. +- KTD6. Hybrid mode keeps git as the metadata and index authority, while object storage + stores selected heavy payload bytes. Git contains stable artifact locator records with + checksums, sizes, and logical paths so readers can verify fetched payloads. +- KTD7. Blob-native mode mirrors the same logical namespaces in the bucket, but does not + emulate git refs. It owns bucket manifests and per-prefix object listings. +- KTD8. Mutable operations are derived overlays. 
Existing `metadata/runs/**/tags.json` + is a compatibility read/write surface until oplog materialization replaces direct + overlay writes. +- KTD9. Publication export is a projection. It should read completed run bundles and + emit a compact publishable directory without becoming a new source of truth. +- KTD10. Backblaze B2 is addressed only through S3-compatible endpoints and Signature V4. + The object client should be a standard S3 client configured with endpoint, region, + bucket, and credentials. + +--- + +## High-Level Technical Design + +### Storage Topology + +```mermaid +flowchart TB + Local[Local run workspace .agentv/results/runs] --> Publish[Result publisher] + Publish --> GitIndex[agentv/results/v1 runs metadata] + Publish --> GitArtifacts[agentv/artifacts/v1 artifact sidecar] + Publish --> Oplog[agentv/oplog/v1 mutable ops] + Publish --> Bucket[(B2 S3-compatible bucket)] + + GitIndex --> Dashboard[Dashboard and CLI readers] + GitArtifacts --> Resolver[Artifact resolver] + Bucket --> Resolver + Oplog --> Tags[Derived tag set and future mutable views] + Resolver --> Dashboard + + GitIndex -. hybrid metadata .-> Bucket + Bucket -. 
blob-native manifest .-> Dashboard +``` + +### Mode Matrix + +| Mode | Canonical index/listing | Artifact payloads | Mutable ops | Git dependency | +| --- | --- | --- | --- | --- | +| `git-native` | `git ls-tree -r agentv/results/v1 -- runs/` plus `git cat-file --batch` for `benchmark.json` | `agentv/artifacts/v1` stores payload bytes | `agentv/oplog/v1` | Required | +| `hybrid` | Same primary git ref as `git-native` | Object storage stores selected payload bytes; git stores locators under the artifact namespace | `agentv/oplog/v1` | Required for index/oplog | +| `blob-native` | Bucket manifest under the results namespace, with `ListObjectsV2` fallback by prefix | Object storage stores all payloads | Bucket oplog prefix | None | + +### Logical Namespace Shape + +```text +agentv/results/v1 + runs///benchmark.json + runs///index.jsonl + runs/// + metadata/runs///materialized-tags.json + +agentv/artifacts/v1 + transcripts////transcript.jsonl + raw-logs////.jsonl + screenshots////.png + +agentv/oplog/v1 + actors//-.json +``` + +For blob-native mode, these are bucket prefixes rather than git refs. The prefix shape +should stay recognizable so readers can share resolver logic across modes. + +--- + +## Section Specs + +### 1. Storage Backend Abstraction And Modes + +**Decision:** Add a narrow storage abstraction around listing, publishing, +materializing artifacts, resolving artifact bytes, syncing, applying retention, and +reading oplog entries. Keep existing git helpers as the first adapter rather than +rewriting all results code at once. + +**File-level plan:** + +- `packages/core/src/evaluation/results-repo.ts` + - Keep `DEFAULT_RESULTS_BRANCH`, deterministic genesis, `listGitRuns()`, + `materializeGitRun()`, and `directPushResults()` as the git adapter's core. + - Extract or wrap adapter-facing functions instead of renaming them in the first + implementation slice. 
+- `packages/core/src/evaluation/loaders/config-loader.ts` + - Extend `ResultsConfig` and `parseResultsConfig()` with `storage_mode` and + `object_store`. + - Preserve current `repo_url`, `repo_path`, `branch`, `remote`, `path`, and + `sync` behavior for `git-native`. +- `packages/core/src/projects.ts` + - Add matching project-registry YAML and internal fields if Dashboard project + bindings can configure hybrid/blob-native storage. +- New core files, names to finalize during implementation: + - `packages/core/src/evaluation/results-storage.ts` for shared interfaces. + - `packages/core/src/evaluation/results-git-storage.ts` for the git adapter if + extraction from `results-repo.ts` becomes large. + - `packages/core/src/evaluation/results-object-storage.ts` for S3-compatible + primitives. +- `apps/cli/src/commands/results/remote.ts` + - Route `listMergedResultFiles()`, `getRemoteResultsStatus()`, + `ensureRemoteRunAvailable()`, and `maybeAutoExportRunArtifacts()` through the + normalized adapter. +- `apps/cli/src/commands/results/serve.ts` + - Route remote run listing, file reads, and tag mutations through storage-resolved + metadata rather than assuming a git materialized path exists. + +**Per-mode listing/index strategy:** + +- `git-native`: list `runs/**/benchmark.json` with `git ls-tree`; batch-read + benchmark blobs with `git cat-file --batch`; materialize run details lazily with + `materializeGitRun()`. +- `hybrid`: list from the same git ref and read the same `benchmark.json` blobs. + Artifact locators in `index.jsonl` or sidecar manifests decide whether bytes come + from git artifacts or object storage. +- `blob-native`: read a compact run manifest from bucket storage first. If the + manifest is missing or stale, fall back to `ListObjectsV2` over + `runs/**/benchmark.json`-equivalent objects, rebuild the manifest, and continue. + Use continuation tokens because S3 listing returns a bounded page per request. 
+ +**Test plan:** + +- `packages/core/test/evaluation/results-storage.test.ts` + - Normalizes missing storage mode to `git-native`. + - Rejects incompatible config combinations, such as `blob-native` with `repo_path` + as a hard dependency. + - Proves the adapter interface can list runs in all modes from fixtures. +- `packages/core/test/evaluation/results-repo.test.ts` + - Existing git-native tests must keep passing. + - Add coverage that `git-native` listing remains one `runs/**/benchmark.json` + tree scan, not a committed index file. +- `apps/cli/test/commands/results/serve.test.ts` + - Dashboard `/api/runs` response shape stays stable across adapter-backed sources. + +**Acceptance:** + +- A dependent implementation bead can add a new storage adapter without changing + Dashboard components. +- Existing `results.repo_path` and `results.repo_url` configs still publish and list + runs as `git-native`. +- Blob-native mode has no code path that shells out to `git`. + +### 2. Git-Native Layout + +**Decision:** Keep one primary results branch, one artifact sidecar ref, and one oplog +ref. Do not add windowed or per-run branches. Do not shard paths before measurement. + +**File-level plan:** + +- `packages/core/src/evaluation/results-repo.ts` + - Keep `DEFAULT_RESULTS_BRANCH = 'agentv/results/v1'`. + - Add constants for the artifact and oplog refs: + `agentv/artifacts/v1` and `agentv/oplog/v1`. + - Add a shared test assertion that all three refs pass `git check-ref-format` + and no ref is a prefix parent or child of another. + - Extend safe-path staging to include only owned top-level paths on each ref. + - Keep `createResultsGenesisCommit()` and `createOrphanResultsBranch()` behavior + for any new git storage refs so independent clients converge on the same root. + - Keep `commitResultsRunWithTemporaryIndex()` for primary run commits. + - Add artifact-ref and oplog-ref commit helpers only if sharing the temporary-index + machinery remains simple. 
+- `apps/cli/src/commands/results/remote.ts` + - Keep `getResultsStorageRef()` returning the primary ref for run listing. + - Add resolver access to artifact and oplog refs without changing remote run IDs. +- `packages/core/test/evaluation/results-repo.test.ts` + - Add deterministic genesis tests for the artifact and oplog refs if they are + created by separate helper functions. + - Add tests that two clients publishing to `agentv/artifacts/v1` converge + rather than minting divergent orphan roots. + +**Layout rules:** + +- Primary ref `agentv/results/v1`: + - Owns `runs/**` and lightweight materialized metadata. + - Lists runs only through `runs/**/benchmark.json`. +- Artifact ref `agentv/artifacts/v1`: + - Owns payload classes under `transcripts/`, `raw-logs/`, and `screenshots/`. + - May store payload bytes in `git-native`. + - May store locator manifests in `hybrid`. +- Oplog ref `agentv/oplog/v1`: + - Owns append-only operation records under `actors/**`. + - Is never used for immutable run payloads. + +**Test plan:** + +- Unit test constants and normalized default branch. +- Integration test with a temporary repo that publishes: + - one run to `agentv/results/v1`; + - one transcript payload to `agentv/artifacts/v1`; + - one tag operation to `agentv/oplog/v1`. +- Assert all three refs can coexist in one temporary repo because none is a + path-prefix of another. +- Assert the source checkout branch does not switch. +- Assert no `agentv/results/v1/` or `agentv/results/run/` refs are created. + +**Acceptance:** + +- `git for-each-ref refs/heads/agentv/results refs/heads/agentv/artifacts refs/heads/agentv/oplog` + shows only the primary ref `agentv/results/v1` and the two sibling sidecar refs + `agentv/artifacts/v1` and `agentv/oplog/v1` for completed-result storage. +- Run listing performance is measured against realistic data before any path sharding + proposal is accepted. + +### 3.
Retention, Compaction, And Transcript Migration + +**Decision:** Retention removes live references first; compaction is an explicit +maintenance action because git history and object-store versioning can keep old bytes +after logical deletion. + +**File-level plan:** + +- New core file, likely `packages/core/src/evaluation/results-retention.ts` + - Evaluate retention policy against normalized run metadata. + - Produce a deletion plan for primary run paths, artifact sidecar paths, oplog + materializations, and object-store payloads. + - Keep policy evaluation pure so git and bucket adapters can execute it. +- `packages/core/src/evaluation/results-repo.ts` + - Add git deletion commits for `runs/**`, `metadata/runs/**`, and artifact-ref + prefixes. + - Add optional compaction helpers only after logical pruning exists. +- `packages/core/src/evaluation/run-artifacts.ts` + - Preserve logical `transcript_path` values while supporting external artifact + locators. + - Add optional artifact locator metadata in `index.jsonl` rather than replacing the + existing path fields. +- `apps/cli/src/commands/results/remote.ts` + - Teach `ensureRemoteRunAvailable()` and future artifact resolvers to fetch a + transcript from `agentv/artifacts/v1` when the run-local path is a logical + reference. +- `apps/cli/src/commands/results/serve.ts` + - Keep file API responses stable for transcript JSONL, whether bytes are local, + materialized from git, or streamed from object storage. + +**Git/hybrid retention rules:** + +- Logical prune commit: + - Removes selected `runs///**` from `agentv/results/v1`. + - Removes selected artifact paths from `agentv/artifacts/v1` or replaces + hybrid locator records with tombstones. + - Appends retention operations to oplog when mutable state is affected. +- Compaction: + - Explicitly rewrites or re-roots storage refs after a backup/export checkpoint. + - Never runs automatically during `agentv eval`. 
+ - Requires remote coordination because old commits and blobs can disappear after + garbage collection. + +**Bucket lifecycle rules:** + +- Hybrid: + - Keep object payloads at least as long as primary git metadata points to them. + - Use object lifecycle for expired payload classes after the git retention plan + removes or tombstones their locators. +- Blob-native: + - Bucket lifecycle can expire artifact payload prefixes independently only when the + bucket manifest and oplog policy mark them expired. + - Keep index manifests longer than payloads when publication or audit needs summary + history without large transcripts. + +**Transcript migration:** + +- Existing runs may have `transcript_path` pointing at + `/outputs/transcript.jsonl`. +- Migration copies transcript bytes to + `agentv/artifacts/v1:transcripts////transcript.jsonl` + or the matching object-store key. +- `index.jsonl` keeps `transcript_path` as the logical path and gains optional locator + metadata with `backend`, `ref` or bucket namespace, `path`, `sha256`, and + `size_bytes`. +- Readers resolve the logical path through locator metadata first and fall back to the + run-local file for historical bundles. + +**Test plan:** + +- `packages/core/test/evaluation/results-retention.test.ts` + - Selects old runs by timestamp and keeps protected latest runs. + - Plans transcript sidecar deletion only after primary metadata no longer points to it. + - Produces separate plans for git-native, hybrid, and blob-native modes. +- `packages/core/test/evaluation/run-artifacts.test.ts` + - Verifies optional artifact locator fields are snake_case and do not break + `parseJsonlResults()`. +- `apps/cli/test/commands/results/serve.test.ts` + - Serves a transcript from sidecar/object locator with the same raw/download + behavior as a run-local transcript. + +**Acceptance:** + +- Retention can remove old live runs without breaking listing for retained runs. 
+- A transcript migrated under `agentv/artifacts/v1` remains viewable through + the existing Dashboard file API. +- Compaction cannot run implicitly as a side effect of publish, sync, or Dashboard + polling. + +### 4. Compact Derived Publication Export + +**Decision:** Publication output is a derived export over the canonical run bundle. +It does not require an `eval.txt` artifact, and it does not become the source of truth +for rerun, comparison, grading, or adapter ingestion. + +**File-level plan:** + +- `apps/cli/src/commands/results/export.ts` + - Keep the current run-workspace export path aligned with + `writeArtifactsFromResults()`. + - Add or route to a publication export mode only if the CLI surface stays narrow. +- New CLI/core files if a separate command reads cleaner: + - `apps/cli/src/commands/results/publication.ts` + - `packages/core/src/evaluation/results-publication.ts` +- `packages/core/src/evaluation/run-artifacts.ts` + - Remains the source for `benchmark.json`, `index.jsonl`, and per-test artifact + schemas. +- `apps/web/src/content/docs/docs/tools/results.mdx` + - Document that publication export reads completed run artifacts and does not + require `eval.txt`. + +**Publication contract:** + +- Inputs: + - completed run workspace; + - `index.jsonl` manifest; + - `benchmark.json`; + - optional sidecar-resolved artifact references for selected public payloads. +- Outputs: + - compact `benchmark.json` and `index.jsonl` or a derived `publication.json`; + - optional static assets for selected summaries; + - no required `eval.txt`. +- Privacy: + - Default export excludes raw prompts, tool args/results, transcripts, screenshots, + and raw logs unless the user opts into a payload class. + +**Test plan:** + +- `apps/cli/test/commands/results/export.test.ts` + - Publication export succeeds with only `benchmark.json` and `index.jsonl`. + - Publication export fails clearly when the manifest is not an AgentV result row. 
+ - Payload opt-in includes only selected sidecar files. +- `apps/cli/test/commands/results/report.test.ts` + - Existing single-run HTML report remains unaffected. + +**Acceptance:** + +- A publication artifact can be generated from a run bundle that has no `eval.txt`. +- The exported publication states or embeds enough summary data for readers without + replacing the canonical run bundle. +- External viewers consume publication output as a projection, not as an AgentV run + workspace. + +### 5. Mutable Run/Result Operations Via Append-Only Oplog + +**Decision:** Implement mutable operations as per-actor append-only operation records. +Tags are the first materialized view and use add-wins set semantics. + +**File-level plan:** + +- `apps/cli/src/commands/results/remote-metadata.ts` + - Preserve current `metadata/runs/**/tags.json` behavior as a compatibility layer. + - Add read/write paths that append oplog operations before or instead of writing + materialized overlays. +- New core file, likely `packages/core/src/evaluation/results-oplog.ts` + - Define operation wire records with snake_case fields. + - Define actor id, sequence/nonce, operation id, target run id, operation kind, + payload, created timestamp, and optional causal metadata. + - Implement add-wins tag projection. +- `packages/core/src/evaluation/results-repo.ts` + - Add git append helpers for `agentv/oplog/v1`. +- `apps/cli/src/commands/results/serve.ts` + - Route tag set, clear, and read endpoints through oplog projection for remote + runs once the adapter is available. +- `apps/dashboard/src/lib/run-list-actions.ts` and tag-related component tests + - Keep UI semantics stable: tags remain free-form chips with existing limits. 
+ +**Operation shape:** + +```yaml +schema_version: agentv.oplog.v1 +op_id: actor-a/2026-06-21T10-00-00-000Z-01hx +actor_id: actor-a +created_at: "2026-06-21T10:00:00.000Z" +target: + run_id: with-skills::2026-06-17T10-00-00-000Z +kind: tag_add +payload: + tag: release-candidate +``` + +For tag projection, removals record `tag_remove` with the tag value. Concurrent add +and remove resolves to present when the add operation is not causally observed by the +remove. That is the add-wins rule and prevents a stale clear from deleting another +actor's later tag addition. + +**Where oplog lives by mode:** + +- `git-native`: `agentv/oplog/v1` git ref, under + `actors//-.json`. +- `hybrid`: same git oplog ref, because git remains the metadata authority. +- `blob-native`: object-store prefix + `oplog/actors//-.json`, with a bucket manifest + for efficient projection rebuilds. + +**Test plan:** + +- `packages/core/test/evaluation/results-oplog.test.ts` + - Projects add-wins tags from add/remove operations. + - Handles duplicate op ids idempotently. + - Keeps operations from different actors without content conflicts. + - Rejects non-snake_case or malformed operation records. +- `apps/cli/test/commands/results/remote-metadata.test.ts` + - Existing overlay tests keep passing. + - New oplog-backed tag write produces the same returned `RemoteRunTagState`. +- `apps/cli/test/commands/results/serve.test.ts` + - Tag API returns effective tags after concurrent actor operations. + +**Acceptance:** + +- A remote tag edit appends an operation and does not rewrite immutable run artifacts. +- Concurrent tag adds from two actors both appear in the materialized tag set. +- Blob-native tag edits work without git. + +### 6. Object-Storage Tier: Backblaze B2 Through S3-Compatible API + +**Decision:** Use Backblaze B2 only through the S3-compatible API with a standard S3 +client. The B2 Native API is out of scope for this storage tier. 
+ +**File-level plan:** + +- `packages/core/package.json` + - Add `@aws-sdk/client-s3` as a direct dependency if object storage code lands in + core. Do not rely on transitive dependencies from provider packages. +- `packages/core/src/evaluation/results-object-storage.ts` + - Create the S3-compatible client from endpoint, region, bucket, prefix, and + environment-provided credentials. + - Implement `put`, `get`, `head`, `delete`, multipart threshold decisions, and + paginated listing. + - Use `ListObjectsV2` continuation tokens for listing. +- `packages/core/src/evaluation/loaders/config-loader.ts` + - Parse object-store config with snake_case fields: + +```yaml +results: + storage_mode: hybrid + repo_path: . + object_store: + provider: s3-compatible + endpoint: ${AGENTV_RESULTS_S3_ENDPOINT} + region: ${AGENTV_RESULTS_S3_REGION} + bucket: ${AGENTV_RESULTS_S3_BUCKET} + prefix: agentv/results/v1 +``` + +- `packages/core/src/evaluation/hooks.ts` + - Reuse existing `before_session` secret-loading support where possible. A project + can run BWS before AgentV commands and inject `AGENTV_RESULTS_S3_*` variables. +- `apps/web/src/content/docs/docs/tools/dashboard.mdx` and + `apps/web/src/content/docs/docs/tools/results.mdx` + - Document that BWS is a local/CI secret source and resolved values must not be + committed. + +**B2 specifics:** + +- Endpoint format is `https://s3.<region>.backblazeb2.com`. +- Authentication uses S3 Signature V4. +- Application key id maps to S3 access key id; application key maps to S3 secret key. +- Configure standard S3 endpoint override, region, and credentials. Do not call B2 + Native API endpoints. + +**BWS secret boundary:** + +- Recommended local/CI flow: + - BWS authenticates with `BWS_ACCESS_TOKEN`. + - BWS injects or exports the S3 endpoint, region, bucket, access key id, and secret + access key into environment variables before AgentV runs. + - AgentV config interpolates variable names or reads environment variables directly. 
+- Never persist resolved BWS values into `benchmark.json`, `index.jsonl`, oplog records, + Dashboard responses, docs examples, or project registry files. + +**Test plan:** + +- `packages/core/test/evaluation/results-object-storage.test.ts` + - Uses a fake S3 client or local test double to verify `PutObject`, `GetObject`, + `HeadObject`, `DeleteObject`, and paginated `ListObjectsV2` behavior. + - Verifies credentials are read from env and are not serialized into manifests. + - Verifies B2 endpoint config is passed as an S3 endpoint override. +- `packages/core/test/evaluation/loaders/config-loader.test.ts` + - Parses object-store config and rejects missing bucket/endpoint for hybrid or + blob-native modes. +- `apps/cli/test/commands/results/serve.test.ts` + - Streams a sidecar artifact from object storage through the existing file API. + +**Acceptance:** + +- Hybrid mode can write a transcript payload to B2 through the S3-compatible client + while listing the run from git. +- Blob-native mode can list runs from bucket metadata without invoking git. +- No code imports a B2-native SDK or calls B2-native API-specific operations. +- No test fixture or docs example contains resolved secret values. + +--- + +## Implementation Units + +### U1. Results Storage Config And Adapter Boundary + +- **Goal:** Add the storage-mode config and adapter interface that later units can use. +- **Requirements:** R1, R2, R3, R4, R19 +- **Dependencies:** None +- **Files:** `packages/core/src/evaluation/loaders/config-loader.ts`, + `packages/core/src/projects.ts`, `packages/core/src/evaluation/results-storage.ts`, + `packages/core/test/evaluation/loaders/config-loader.test.ts`, + `packages/core/test/projects.test.ts`, + `packages/core/test/evaluation/results-storage.test.ts` +- **Approach:** Introduce storage mode without overloading `results.mode: github`. 
+ Normalize missing `storage_mode` to `git-native`, keep current git fields valid, and + define adapter methods for listing, publishing, materializing, artifact reads, + oplog reads, and retention. +- **Patterns to follow:** `normalizeResultsConfig()` in + `packages/core/src/evaluation/results-repo.ts`; `fromYaml()` and `toYaml()` in + `packages/core/src/projects.ts`; snake_case boundary rules in `.agents/conventions.md`. +- **Test scenarios:** + - Given current `repo_path: .` config with no storage mode, normalization returns + `git-native`. + - Given `storage_mode: hybrid`, parser requires valid git configuration and + `object_store`. + - Given `storage_mode: blob-native`, parser accepts `object_store` without + `repo_path` or `repo_url`. + - Given `blob-native` config with no object store, parser rejects it with a clear + warning. + - Given project registry results config with object-store fields, YAML load/save + preserves snake_case on disk and camelCase internally. + - Given legacy `mode: github`, git-native config still works and does not imply + GitHub-only storage. +- **Verification:** Existing git-native publish/list tests still compile against the + normalized config, and new mode tests do not require real network access. + +### U2. Git Refs, Sidecar Constants, And Artifact Locator Support + +- **Goal:** Pin the three git refs and add resolver support for sidecar artifacts. +- **Requirements:** R5, R6, R7, R8, R9, R12 +- **Dependencies:** U1 +- **Files:** `packages/core/src/evaluation/results-repo.ts`, + `packages/core/src/evaluation/run-artifacts.ts`, + `apps/cli/src/commands/results/remote.ts`, + `apps/cli/src/commands/results/serve.ts`, + `packages/core/test/evaluation/results-repo.test.ts`, + `packages/core/test/evaluation/run-artifacts.test.ts`, + `apps/cli/test/commands/results/serve.test.ts` +- **Approach:** Keep `agentv/results/v1` as the listable run ref. Add named constants + for artifact and oplog refs. 
Add optional artifact locator metadata while preserving + existing logical path fields such as `transcript_path`. +- **Patterns to follow:** Current deterministic genesis functions in `results-repo.ts`; + `buildIndexArtifactEntry()` and `buildResultIndexArtifact()` in `run-artifacts.ts`; + existing transcript file API tests in `serve.test.ts`. +- **Test scenarios:** + - Given a run with a sidecar transcript locator, Dashboard raw file endpoint returns + the same text/plain response as a local transcript file. + - Given no sidecar locator, historical run-local `transcript_path` still resolves. + - Given two clients create an artifact ref, the genesis commit is deterministic. + - Given a publish, no per-run or windowed result refs are created. +- **Verification:** `listGitRuns()` output is unchanged for runs that do not use sidecar + payloads. + +### U3. Retention And Compaction Planner + +- **Goal:** Add retention planning that can prune runs and sidecars without implicit + history compaction. +- **Requirements:** R10, R11, R12 +- **Dependencies:** U1, U2 +- **Files:** `packages/core/src/evaluation/results-retention.ts`, + `packages/core/src/evaluation/results-repo.ts`, + `packages/core/src/evaluation/results-object-storage.ts`, + `packages/core/test/evaluation/results-retention.test.ts`, + `packages/core/test/evaluation/results-repo.test.ts` +- **Approach:** Build a pure planner first. Execution adapters take the plan and create + git deletion commits or bucket deletion batches. Keep compaction as a separate + explicit operation with stronger confirmation and documentation. +- **Patterns to follow:** Safe path filters in `isSafeResultsRepoPath()` and + `existingTrackedResultsDirs()`; project sync's blocked status reporting. +- **Test scenarios:** + - Given runs older than a retention threshold, planner selects primary run paths and + sidecar paths for deletion. + - Given a sidecar transcript still referenced by a retained run, planner keeps it. 
+ - Given object lifecycle policy shorter than metadata retention, planner reports the + mismatch instead of approving deletion. + - Given compaction is not requested, no history rewrite operation is emitted. +- **Verification:** Retention execution can be tested against a temporary git repo and a + fake object store without touching real remotes. + +### U4. Publication Export Projection + +- **Goal:** Add the compact publication export without requiring `eval.txt`. +- **Requirements:** R13 +- **Dependencies:** U1 +- **Files:** `apps/cli/src/commands/results/export.ts`, + `apps/cli/src/commands/results/index.ts`, + `packages/core/src/evaluation/results-publication.ts`, + `apps/web/src/content/docs/docs/tools/results.mdx`, + `apps/cli/test/commands/results/export.test.ts` +- **Approach:** Keep publication export read-only over completed run artifacts. Use + `parseJsonlResults()` and `benchmark.json` metadata as inputs. If a new command is + clearer than another export option, keep it under `agentv results` but document it as + projection-only. +- **Patterns to follow:** `loadExportSource()` and `deriveOutputDir()` in + `apps/cli/src/commands/results/export.ts`; `results report` docs for static output + framing. +- **Test scenarios:** + - Given a run with `index.jsonl` and `benchmark.json`, publication export succeeds + with no `eval.txt`. + - Given an invalid JSONL input that is not an AgentV result row, publication export + fails with the existing result-row schema guidance. + - Given transcript payloads exist, publication export excludes them by default. + - Given payload opt-in for transcripts, publication export includes only selected + sidecar-resolved transcript files. +- **Verification:** The generated publication output can be inspected from disk and does + not modify the source run workspace. + +### U5. Oplog And Add-Wins Tag Projection + +- **Goal:** Replace direct mutable metadata writes with append-only operations and a tag + projection. 
+- **Requirements:** R14, R15, R16 +- **Dependencies:** U1, U2 +- **Files:** `packages/core/src/evaluation/results-oplog.ts`, + `packages/core/src/evaluation/results-repo.ts`, + `apps/cli/src/commands/results/remote-metadata.ts`, + `apps/cli/src/commands/results/serve.ts`, + `apps/dashboard/src/lib/run-list-actions.ts`, + `packages/core/test/evaluation/results-oplog.test.ts`, + `apps/cli/test/commands/results/remote-metadata.test.ts`, + `apps/cli/test/commands/results/serve.test.ts`, + `apps/dashboard/src/lib/run-list-actions.test.ts` +- **Approach:** Append `tag_add` and `tag_remove` operations per actor, materialize the + effective tag set for read performance, and keep current Dashboard tag UX stable. +- **Patterns to follow:** Current `RemoteRunTagState` shape and `metadata/runs/**` + overlay path handling in `remote-metadata.ts`. +- **Test scenarios:** + - Given two actors add different tags concurrently, both tags are visible. + - Given one actor clears tags while another later adds a tag, the later add wins. + - Given duplicate operation ids, projection is idempotent. + - Given malformed operation JSON, projection reports a warning and skips the record. + - Given blob-native mode, tag operations are stored under bucket oplog prefix and no + git command runs. +- **Verification:** Dashboard tag endpoints return the same response shape as today. + +### U6. S3-Compatible Object Store And B2 Integration + +- **Goal:** Add the object-store tier used by hybrid and blob-native modes. 
+- **Requirements:** R2, R3, R4, R11, R17, R18, R19 +- **Dependencies:** U1 +- **Files:** `packages/core/package.json`, `bun.lock`, + `packages/core/src/evaluation/results-object-storage.ts`, + `packages/core/src/evaluation/loaders/config-loader.ts`, + `packages/core/test/evaluation/results-object-storage.test.ts`, + `packages/core/test/evaluation/loaders/config-loader.test.ts`, + `apps/web/src/content/docs/docs/tools/dashboard.mdx`, + `apps/web/src/content/docs/docs/tools/results.mdx` +- **Approach:** Add a standard S3 client wrapper with endpoint override support. Keep + B2-specific knowledge in docs/config examples and endpoint validation, not in a + B2-native SDK layer. +- **Patterns to follow:** Existing env interpolation in config loader; `hooks.before_session` + parsing in `packages/core/src/evaluation/hooks.ts`; secret-redaction posture in + task-bundle tests. +- **Test scenarios:** + - Given B2-style endpoint, region, bucket, and env credentials, object client is + configured as S3-compatible. + - Given a paginated object listing, all pages are read using continuation tokens. + - Given missing credentials, error message names variables but not values. + - Given a sidecar upload, the stored locator includes checksum and size but no secret. + - Given blob-native listing, run manifests load from bucket without git. +- **Verification:** Unit tests use a fake S3 client; no real B2 bucket is needed for CI. + +--- + +## System-Wide Impact + +- **Core:** `results-repo.ts` stops being the only remote-results boundary and becomes + the git adapter or wrapped by one. +- **CLI:** `results export`, auto-publish, and Dashboard server routes need adapter + routing but should preserve existing user-facing response shapes. +- **Dashboard:** The UI should not learn storage-specific concepts. It consumes the same + run list, file, and tag API responses. 
+- **Docs:** Results and Dashboard docs need updated wording because current docs still + imply only git-backed remote results and mention committed `.agentv/results/**` paths + in places that now flatten on-branch to `runs/**`. +- **Secrets:** Object-store credentials must stay in environment or local secret-loading + flows. The implementation must not serialize them into artifacts or Dashboard JSON. + +--- + +## Risks And Mitigations + +| Risk | Mitigation | +| --- | --- | +| Storage abstraction balloons beyond current need | Keep interface methods tied to existing results operations: list, publish, materialize/read artifact, sync, retention, oplog. | +| Hybrid locators break old readers | Keep existing logical path fields and add optional locator metadata. Old bundles keep local files; new readers prefer locators. | +| Git compaction surprises collaborators | Make compaction explicit and separate from retention. Document backup and remote coordination requirements before implementation. | +| Blob-native listing becomes expensive | Use a bucket manifest as the fast path and `ListObjectsV2` as a rebuild/fallback path. Add sharding only after measurement. | +| Secrets leak through config or artifacts | Use env interpolation and BWS injection only; tests assert secret values are absent from manifests, docs fixtures, and errors. | +| B2 differences from AWS S3 leak into core | Use standard S3 client operations and endpoint override. Keep B2-specific docs limited to endpoint/credential mapping. | + +--- + +## Acceptance Checklist + +- [ ] Spec includes one section each for storage modes, git-native layout, + retention/compaction, publication export, oplog, and object storage. +- [ ] All refs are pinned exactly: `agentv/results/v1`, `agentv/artifacts/v1`, + and `agentv/oplog/v1`. +- [ ] Shared ref tests assert the three refs are valid Git refnames and cannot + prefix-conflict. +- [ ] The artifact sidecar is called `artifacts`, not `artifact-blobs` or `blob`. 
+- [ ] The plan has no windowed or per-run branches. +- [ ] Path sharding is deferred until realistic measurement proves need. +- [ ] AgentV artifacts remain canonical; Dashboard, Hugging Face, Phoenix, B2, and + GitHub are projections/viewers/storage backends. +- [ ] File/function-level implementation guidance names current result repo, remote, + serve, export, artifact-writer, and Dashboard surfaces. +- [ ] Test plan covers core, CLI, Dashboard, and docs-facing behavior. +- [ ] Dependent beads `av-dcs`, `av-kxa`, `av-8un`, `av-dsc`, and `av-thr` can pick + up scoped implementation units without inventing storage decisions. + +--- + +## Sources And Research + +- `docs/plans/git-native-results.md` for the current git-tree-as-index contract. +- `docs/plans/results-branch-layout.md` for flattened `runs/` and `metadata/runs/` + layout. +- `packages/core/src/evaluation/results-repo.ts` for deterministic genesis, + `directPushResults()`, `listGitRuns()`, and `materializeGitRun()`. +- `packages/core/src/evaluation/run-artifacts.ts` and + `apps/cli/src/commands/eval/artifact-writer.ts` for `benchmark.json`, + `index.jsonl`, `outputs/trace.json`, and transcript sidecars. +- `apps/cli/src/commands/results/remote.ts`, + `apps/cli/src/commands/results/remote-metadata.ts`, + `apps/cli/src/commands/results/serve.ts`, and + `apps/cli/src/commands/results/export.ts` for current CLI/Dashboard remote, + metadata, serving, and export behavior. +- `docs/adr/2026-06-18-opik-post-run-export-boundary.md` for the adapter boundary + that keeps AgentV run bundles canonical. +- Backblaze B2 S3-compatible docs: + `https://www.backblaze.com/docs/cloud-storage-call-the-s3-compatible-api` and + `https://www.backblaze.com/apidocs/introduction-to-the-s3-compatible-api`. +- AWS S3 `ListObjectsV2` docs: + `https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html`. +- Bitwarden Secrets Manager CLI docs: + `https://bitwarden.com/help/secrets-manager-cli/`. 
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 41ee9610e..213940190 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -129,6 +129,21 @@ function usesFileReferencePrompt(provider: Provider): boolean { return isAgentProvider(provider) || provider.kind === 'cli'; } +function extractProviderRawLogPath(response: ProviderResponse): string | undefined { + const raw = response.raw; + if (!raw || typeof raw !== 'object' || Array.isArray(raw)) { + return undefined; + } + + const logFile = (raw as Record).logFile; + if (typeof logFile !== 'string') { + return undefined; + } + + const trimmed = logFile.trim(); + return trimmed.length > 0 ? trimmed : undefined; +} + interface EvaluationRuntimeOptions { readonly target: ResolvedTarget; readonly targets?: readonly TargetDefinition[]; @@ -1588,6 +1603,7 @@ async function runBatchEvaluation(options: { const tokenUsage = merged?.tokenUsage; const startTime = merged?.startTime; const endTime = merged?.endTime; + const rawProviderLogPath = extractProviderRawLogPath(providerResponse); // Extract candidate from last assistant message in output const candidate = extractLastAssistantContent(output); @@ -1615,6 +1631,7 @@ async function runBatchEvaluation(options: { tokenUsage, startTime, endTime, + rawProviderLogPath, targetResolver, availableTargets, verbose, @@ -1982,6 +1999,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise Provider | undefined; readonly availableTargets?: readonly string[]; readonly fileChanges?: string; @@ -2404,6 +2424,7 @@ async function evaluateCandidate(options: { tokenUsage, startTime, endTime, + rawProviderLogPath, targetResolver, availableTargets, fileChanges, @@ -2514,6 +2535,7 @@ async function evaluateCandidate(options: { output: candidate, scores: scores, trace: evaluationTrace, + rawProviderLogPath, fileChanges, executionStatus: 
classifyQualityStatus(score.score, evalThreshold), }; diff --git a/packages/core/src/evaluation/result-artifact-contract.ts b/packages/core/src/evaluation/result-artifact-contract.ts new file mode 100644 index 000000000..a8f122bf5 --- /dev/null +++ b/packages/core/src/evaluation/result-artifact-contract.ts @@ -0,0 +1,108 @@ +/** + * AgentV-owned result artifact contract. + * + * This module centralizes the git refs and portable pointer shapes used by run + * records. Local run workspaces still write their files under the existing + * per-result artifact directories; these pointers describe where those same + * AgentV-owned artifacts belong when projected to a results ref, sidecar ref, + * or object store. + */ + +export const AGENTV_RESULTS_PRIMARY_REF = 'agentv/results/v1' as const; +export const AGENTV_RESULTS_ARTIFACTS_REF = 'agentv/artifacts/v1' as const; +export const AGENTV_RESULTS_OPLOG_REF = 'agentv/oplog/v1' as const; + +export const AGENTV_RESULTS_REFS = { + primary: AGENTV_RESULTS_PRIMARY_REF, + artifacts: AGENTV_RESULTS_ARTIFACTS_REF, + oplog: AGENTV_RESULTS_OPLOG_REF, +} as const; + +export const CANONICAL_TRACE_ARTIFACT_PATH = 'outputs/trace.json' as const; +export const CANONICAL_TRANSCRIPT_ARTIFACT_PATH = 'outputs/transcript.jsonl' as const; + +export const TRANSCRIPT_SCHEMA_VERSION = 'agentv.transcript.v1' as const; +export const TRANSCRIPT_JSONL_MEDIA_TYPE = 'application/x-ndjson' as const; +export const TRACE_JSON_MEDIA_TYPE = 'application/vnd.agentv.trace.v1+json' as const; + +export type AgentVResultsRefName = (typeof AGENTV_RESULTS_REFS)[keyof typeof AGENTV_RESULTS_REFS]; + +export type ResultArtifactFamily = + | 'traces' + | 'transcripts' + | 'outputs' + | 'raw-logs' + | 'screenshots'; + +export interface ResultArtifactPointer { + readonly ref: AgentVResultsRefName | string; + readonly key: string; + readonly objectVersion: string; + readonly path: string; + readonly sha256: string; + readonly size: number; + readonly schemaVersion: string; + 
readonly mediaType: string; + readonly family?: ResultArtifactFamily; +} + +export interface ResultArtifactPointerWire { + readonly ref: AgentVResultsRefName | string; + readonly key: string; + readonly object_version: string; + readonly path: string; + readonly sha256: string; + readonly size: number; + readonly schema_version: string; + readonly media_type: string; + readonly family?: ResultArtifactFamily; +} + +export type TranscriptArtifactPointer = ResultArtifactPointer & { + readonly schemaVersion: typeof TRANSCRIPT_SCHEMA_VERSION; + readonly mediaType: typeof TRANSCRIPT_JSONL_MEDIA_TYPE; + readonly family: 'transcripts'; +}; + +export type TranscriptArtifactPointerWire = ResultArtifactPointerWire & { + readonly schema_version: typeof TRANSCRIPT_SCHEMA_VERSION; + readonly media_type: typeof TRANSCRIPT_JSONL_MEDIA_TYPE; + readonly family: 'transcripts'; +}; + +export interface ResultArtifactPointersWire { + readonly trace?: ResultArtifactPointerWire; + readonly transcript?: TranscriptArtifactPointerWire; +} + +export function toResultArtifactPointerWire( + pointer: ResultArtifactPointer, +): ResultArtifactPointerWire { + return { + ref: pointer.ref, + key: pointer.key, + object_version: pointer.objectVersion, + path: pointer.path, + sha256: pointer.sha256, + size: pointer.size, + schema_version: pointer.schemaVersion, + media_type: pointer.mediaType, + family: pointer.family, + }; +} + +export function fromResultArtifactPointerWire( + pointer: ResultArtifactPointerWire, +): ResultArtifactPointer { + return { + ref: pointer.ref, + key: pointer.key, + objectVersion: pointer.object_version, + path: pointer.path, + sha256: pointer.sha256, + size: pointer.size, + schemaVersion: pointer.schema_version, + mediaType: pointer.media_type, + family: pointer.family, + }; +} diff --git a/packages/core/src/evaluation/result-row-schema.ts b/packages/core/src/evaluation/result-row-schema.ts index fff7ef0e0..64a82e3e0 100644 --- 
a/packages/core/src/evaluation/result-row-schema.ts +++ b/packages/core/src/evaluation/result-row-schema.ts @@ -34,6 +34,7 @@ const RESULT_ROW_ALIASES = { gradingPath: 'grading_path', inputPath: 'input_path', outputPath: 'output_path', + rawProviderLogPath: 'raw_provider_log_path', responsePath: 'response_path', startTime: 'start_time', targetsPath: 'targets_path', @@ -45,6 +46,10 @@ const RESULT_ROW_ALIASES = { workspacePath: 'workspace_path', } as const; +const NEW_SNAKE_CASE_ONLY_FIELDS = { + artifactPointers: 'artifact_pointers', +} as const; + const TRACE_SUMMARY_ALIASES = { costUsd: 'cost_usd', durationMs: 'duration_ms', @@ -149,6 +154,19 @@ function buildInvalidScoreError(context: { return new ResultRowSchemaError(`Missing or invalid score in result row${location}.`); } +function buildSnakeCaseOnlyFieldError( + field: keyof typeof NEW_SNAKE_CASE_ONLY_FIELDS, + context: { lineNumber?: number; sourceLabel?: string }, +): ResultRowSchemaError { + const location = [ + context.sourceLabel ? ` in ${context.sourceLabel}` : '', + context.lineNumber !== undefined ? ` at line ${context.lineNumber}` : '', + ].join(''); + return new ResultRowSchemaError( + `Unsupported camelCase result row field "${field}"${location}. 
Use "${NEW_SNAKE_CASE_ONLY_FIELDS[field]}".`, + ); +} + function looksLikeResultRow(value: Record): boolean { return ( typeof value.test_id === 'string' || @@ -169,6 +187,14 @@ export function normalizeResultRow( throw buildSchemaError(context); } + for (const field of Object.keys( + NEW_SNAKE_CASE_ONLY_FIELDS, + ) as (keyof typeof NEW_SNAKE_CASE_ONLY_FIELDS)[]) { + if (Object.hasOwn(value, field)) { + throw buildSnakeCaseOnlyFieldError(field, context); + } + } + const normalized = normalizeKnownAliases(value, RESULT_ROW_ALIASES); if (normalized.trace !== undefined) { normalized.trace = normalizeTraceSummary(normalized.trace); diff --git a/packages/core/src/evaluation/results-repo.ts b/packages/core/src/evaluation/results-repo.ts index bbe511487..11f7f5bd2 100644 --- a/packages/core/src/evaluation/results-repo.ts +++ b/packages/core/src/evaluation/results-repo.ts @@ -15,6 +15,7 @@ import { promisify } from 'node:util'; import { getAgentvDataDir } from '../paths.js'; import type { ResultsConfig } from './loaders/config-loader.js'; +import { AGENTV_RESULTS_PRIMARY_REF } from './result-artifact-contract.js'; const execFileAsync = promisify(execFile); // Local working-tree run workspace inside the eval repo. Local commands @@ -32,7 +33,7 @@ const RESULTS_REPO_METADATA_DIR = 'metadata'; const RESULTS_REPO_TRACKED_DIRS = [RESULTS_REPO_RUNS_DIR, RESULTS_REPO_METADATA_DIR] as const; const RESULTS_REPO_COMMIT_EMAIL = 'agentv@results-repo'; const RESULTS_REPO_COMMIT_NAME = 'AgentV Results'; -export const DEFAULT_RESULTS_BRANCH = 'agentv/results/v1'; +export const DEFAULT_RESULTS_BRANCH = AGENTV_RESULTS_PRIMARY_REF; const GIT_EMPTY_TREE = '4b825dc642cb6eb9a060e54bf8d69288fbee4904'; // The results branch is a self-rooted orphan whose first commit is a fixed, // byte-identical empty-tree genesis. 
Pinning the message, identity (see diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index c15fb205d..af93cc692 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -7,7 +7,8 @@ * snake_case here so every caller produces the same artifacts. */ -import { mkdir, readFile, writeFile } from 'node:fs/promises'; +import { createHash } from 'node:crypto'; +import { copyFile, mkdir, readFile, writeFile } from 'node:fs/promises'; import path from 'node:path'; import { traceEnvelopeToTranscriptJsonLines } from '../import/types.js'; @@ -22,8 +23,22 @@ import { } from './projection-identity.js'; import type { Message } from './providers/types.js'; import { extractLastAssistantContent } from './providers/types.js'; +import { + AGENTV_RESULTS_ARTIFACTS_REF, + CANONICAL_TRACE_ARTIFACT_PATH, + CANONICAL_TRANSCRIPT_ARTIFACT_PATH, + type ResultArtifactFamily, + type ResultArtifactPointerWire, + type ResultArtifactPointersWire, + TRACE_JSON_MEDIA_TYPE, + TRANSCRIPT_JSONL_MEDIA_TYPE, + TRANSCRIPT_SCHEMA_VERSION, + type TranscriptArtifactPointerWire, + toResultArtifactPointerWire, +} from './result-artifact-contract.js'; import { normalizeResultRow } from './result-row-schema.js'; import { + EXECUTION_TRACE_SCHEMA_VERSION, type TraceEnvelope, buildTraceEnvelopeFromEvaluationResult, toTraceEnvelopeWire, @@ -205,6 +220,8 @@ export interface IndexArtifactEntry { readonly output_path?: string; readonly answer_path?: string; readonly transcript_path?: string; + readonly artifact_pointers?: ResultArtifactPointersWire; + readonly raw_provider_log_path?: string; readonly input_path?: string; readonly response_path?: string; readonly task_dir?: string; @@ -226,7 +243,12 @@ export type ResultIndexArtifact = IndexArtifactEntry; export type AdditionalResultIndexFields = Partial< Pick< IndexArtifactEntry, - 'task_dir' | 'eval_path' | 'targets_path' | 'files_path' | 'graders_path' + 
| 'task_dir' + | 'eval_path' + | 'targets_path' + | 'files_path' + | 'graders_path' + | 'raw_provider_log_path' > >; @@ -351,7 +373,7 @@ function toIndexScore(score: GraderResult): Record { score: score.score, weight: score.weight, verdict: score.verdict, - assertions: score.assertions.map(toIndexAssertion), + assertions: (score.assertions ?? []).map(toIndexAssertion), raw_request: score.rawRequest, input: score.input, target: score.target, @@ -732,6 +754,26 @@ function resultHasExecutionTraceTranscript(result: EvaluationResult): boolean { return result.output.length > 0 || result.trace.messages.length > 0; } +function rawProviderLogSourcePath(result: EvaluationResult): string | undefined { + const sourcePath = result.rawProviderLogPath?.trim(); + return sourcePath ? sourcePath : undefined; +} + +function rawProviderLogArtifactPath(outputsDir: string): string { + return path.join(outputsDir, 'raw', 'provider.log'); +} + +async function copyRawProviderLogArtifact(sourcePath: string, outputsDir: string): Promise { + const destinationPath = rawProviderLogArtifactPath(outputsDir); + if (path.resolve(sourcePath) === path.resolve(destinationPath)) { + return destinationPath; + } + + await mkdir(path.dirname(destinationPath), { recursive: true }); + await copyFile(sourcePath, destinationPath); + return destinationPath; +} + interface TraceEnvelopeSidecarParams { readonly result: EvaluationResult; readonly outputDir: string; @@ -751,10 +793,13 @@ function buildTraceEnvelopeSidecar(params: TraceEnvelopeSidecarParams): TraceEnv source: { path: RESULT_INDEX_FILENAME }, capture: { content: 'full', redactionLevel: 'none', redactedFields: [] }, artifacts: { - trace_path: 'outputs/trace.json', + trace_path: CANONICAL_TRACE_ARTIFACT_PATH, answer_path: params.result.output.length > 0 ? 'outputs/answer.md' : undefined, response_path: params.result.output.length > 0 ? 'outputs/response.md' : undefined, - transcript_path: hasTranscript ? 
'outputs/transcript.jsonl' : undefined, + transcript_path: hasTranscript ? CANONICAL_TRANSCRIPT_ARTIFACT_PATH : undefined, + raw_provider_log_path: rawProviderLogSourcePath(params.result) + ? 'outputs/raw/provider.log' + : undefined, }, duplicatePolicy: params.duplicatePolicy, }); @@ -772,6 +817,72 @@ async function writeTraceEnvelopeSidecar( return envelope; } +function buildSidecarArtifactKey(family: ResultArtifactFamily, runRelativePath: string): string { + return path.posix.join(family, runRelativePath); +} + +async function buildArtifactPointer(params: { + readonly filePath: string; + readonly runRelativePath: string; + readonly family: ResultArtifactFamily; + readonly schemaVersion: string; + readonly mediaType: string; +}): Promise { + const content = await readFile(params.filePath); + const sha256 = createHash('sha256').update(content).digest('hex'); + return toResultArtifactPointerWire({ + ref: AGENTV_RESULTS_ARTIFACTS_REF, + key: buildSidecarArtifactKey(params.family, params.runRelativePath), + objectVersion: `sha256:${sha256}`, + path: params.runRelativePath, + sha256, + size: content.byteLength, + schemaVersion: params.schemaVersion, + mediaType: params.mediaType, + family: params.family, + }); +} + +async function buildTracePointer( + outputDir: string, + tracePath: string, +): Promise { + return buildArtifactPointer({ + filePath: tracePath, + runRelativePath: toRelativeArtifactPath(outputDir, tracePath), + family: 'traces', + schemaVersion: EXECUTION_TRACE_SCHEMA_VERSION, + mediaType: TRACE_JSON_MEDIA_TYPE, + }); +} + +async function buildTranscriptPointer( + outputDir: string, + transcriptPath: string, +): Promise { + const pointer = await buildArtifactPointer({ + filePath: transcriptPath, + runRelativePath: toRelativeArtifactPath(outputDir, transcriptPath), + family: 'transcripts', + schemaVersion: TRANSCRIPT_SCHEMA_VERSION, + mediaType: TRANSCRIPT_JSONL_MEDIA_TYPE, + }); + return pointer as TranscriptArtifactPointerWire; +} + +async function 
buildArtifactPointers(params: { + readonly outputDir: string; + readonly tracePath: string; + readonly transcriptPath?: string; +}): Promise { + return { + trace: await buildTracePointer(params.outputDir, params.tracePath), + ...(params.transcriptPath + ? { transcript: await buildTranscriptPointer(params.outputDir, params.transcriptPath) } + : {}), + }; +} + export function buildIndexArtifactEntry( result: EvaluationResult, options: { @@ -782,6 +893,8 @@ export function buildIndexArtifactEntry( outputPath?: string; answerPath?: string; transcriptPath?: string; + artifactPointers?: ResultArtifactPointersWire; + rawProviderLogPath?: string; inputPath?: string; responsePath?: string; extraIndexFields?: AdditionalResultIndexFields; @@ -822,6 +935,10 @@ export function buildIndexArtifactEntry( transcript_path: options.transcriptPath ? toRelativeArtifactPath(options.outputDir, options.transcriptPath) : undefined, + raw_provider_log_path: options.rawProviderLogPath + ? toRelativeArtifactPath(options.outputDir, options.rawProviderLogPath) + : undefined, + artifact_pointers: options.artifactPointers, input_path: options.inputPath ? toRelativeArtifactPath(options.outputDir, options.inputPath) : undefined, @@ -843,12 +960,14 @@ export function buildResultIndexArtifact( options?: { projectionIdentity?: ProjectionIdentity; duplicatePolicy?: ExportDuplicatePolicy; + artifactPointers?: ResultArtifactPointersWire; }, ): ResultIndexArtifact { const artifactSubdir = buildArtifactSubdir(result); const input = extractInput(result); const hasAnswer = result.output.length > 0; const hasTranscript = resultHasExecutionTraceTranscript(result); + const hasRawProviderLog = rawProviderLogSourcePath(result) !== undefined; return { timestamp: result.timestamp, @@ -878,6 +997,10 @@ export function buildResultIndexArtifact( transcript_path: hasTranscript ? path.posix.join(artifactSubdir, 'outputs', 'transcript.jsonl') : undefined, + raw_provider_log_path: hasRawProviderLog + ? 
path.posix.join(artifactSubdir, 'outputs', 'raw', 'provider.log') + : undefined, + artifact_pointers: options?.artifactPointers, response_path: hasAnswer ? path.posix.join(artifactSubdir, 'outputs', 'response.md') : undefined, @@ -1122,6 +1245,8 @@ function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefin } const result = value as Record; + const parsedResult = { ...result }; + parsedResult.rawProviderLogPath = undefined; const legacyOutputMessages = Array.isArray(result.output) ? result.output.filter(isOutputMessage) : undefined; @@ -1148,7 +1273,7 @@ function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefin }); return { - ...result, + ...parsedResult, timestamp: typeof result.timestamp === 'string' ? result.timestamp : new Date(0).toISOString(), testId: typeof result.testId === 'string' ? result.testId : 'unknown', score: typeof result.score === 'number' ? result.score : 0, @@ -1263,6 +1388,11 @@ export async function writePerTestArtifacts( await writeFile(path.join(outputsDir, 'answer.md'), result.output, 'utf8'); await writeFile(path.join(outputsDir, 'response.md'), result.output, 'utf8'); } + const rawProviderLogSource = rawProviderLogSourcePath(result); + if (rawProviderLogSource) { + await copyRawProviderLogArtifact(rawProviderLogSource, outputsDir); + } + const tracePath = path.join(outputsDir, 'trace.json'); const envelope = await writeTraceEnvelopeSidecar({ result, outputDir, @@ -1272,9 +1402,17 @@ export async function writePerTestArtifacts( runId: options?.runId, duplicatePolicy, }); - if (hasTranscriptProjection(result, envelope)) { - await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result, envelope); + const transcriptPath = hasTranscriptProjection(result, envelope) + ? 
path.join(outputsDir, 'transcript.jsonl') + : undefined; + if (transcriptPath) { + await writeTranscriptJsonl(transcriptPath, result, envelope); } + const artifactPointers = await buildArtifactPointers({ + outputDir, + tracePath, + transcriptPath, + }); const extraIndexFields = await collectAdditionalIndexFields( result, @@ -1288,6 +1426,7 @@ export async function writePerTestArtifacts( ...buildResultIndexArtifact(result, extraIndexFields, { projectionIdentity: envelope.projectionIdentity, duplicatePolicy, + artifactPointers, }), experiment: options?.experiment, }); @@ -1351,6 +1490,11 @@ export async function writeArtifactsFromResults( const transcriptPath = hasTranscriptProjection(result, envelope) ? path.join(outputsDir, 'transcript.jsonl') : undefined; + const tracePath = path.join(outputsDir, 'trace.json'); + const rawProviderLogSource = rawProviderLogSourcePath(result); + const rawProviderLogPath = rawProviderLogSource + ? rawProviderLogArtifactPath(outputsDir) + : undefined; const projectionIdentity = envelope.projectionIdentity; if (!projectionIdentity) { throw new Error(`Result ${result.testId ?? 
'unknown'} is missing projection identity`); @@ -1368,9 +1512,12 @@ export async function writeArtifactsFromResults( outputsDir, answerPath, responsePath, + tracePath, envelope, projectionIdentity, transcriptPath, + rawProviderLogSource, + rawProviderLogPath, identityId, }; }); @@ -1416,14 +1563,22 @@ export async function writeArtifactsFromResults( await writeFile(plan.answerPath, result.output, 'utf8'); await writeFile(plan.responsePath, result.output, 'utf8'); } + if (plan.rawProviderLogSource) { + await copyRawProviderLogArtifact(plan.rawProviderLogSource, plan.outputsDir); + } await writeFile( - path.join(plan.outputsDir, 'trace.json'), + plan.tracePath, `${JSON.stringify(toTraceEnvelopeWire(envelope), null, 2)}\n`, 'utf8', ); if (plan.transcriptPath) { await writeTranscriptJsonl(plan.transcriptPath, result, envelope); } + const artifactPointers = await buildArtifactPointers({ + outputDir, + tracePath: plan.tracePath, + transcriptPath: plan.transcriptPath, + }); const extraIndexFields = await collectAdditionalIndexFields( result, @@ -1442,6 +1597,8 @@ export async function writeArtifactsFromResults( outputPath: plan.answerPath, answerPath: plan.answerPath, transcriptPath: plan.transcriptPath, + artifactPointers, + rawProviderLogPath: plan.rawProviderLogPath, inputPath: plan.inputPath, responsePath: plan.responsePath, extraIndexFields, diff --git a/packages/core/src/evaluation/trace-envelope.ts b/packages/core/src/evaluation/trace-envelope.ts index ba3244139..c1fbb961e 100644 --- a/packages/core/src/evaluation/trace-envelope.ts +++ b/packages/core/src/evaluation/trace-envelope.ts @@ -775,7 +775,7 @@ function scoresFromResult( targetSpanId, evidence: dropUndefined({ span_ids: [targetSpanId], - assertions: score.assertions.map((assertion) => + assertions: (score.assertions ?? 
[]).map((assertion) => dropUndefined({ text: assertion.text, passed: assertion.passed, diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 5d1e139ca..01b63b398 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -1184,6 +1184,12 @@ export interface EvaluationResult { readonly error?: string; /** Canonical execution trace: messages, events, metrics, and provider provenance. */ readonly trace: Trace; + /** + * Optional local provider-native session/stream log captured by a provider. + * Artifact writers copy this byte-for-byte into the run bundle as raw, + * non-canonical evidence and expose only the run-local pointer. + */ + readonly rawProviderLogPath?: string; /** Path to the temporary workspace directory (included on failure for debugging) */ readonly workspacePath?: string; /** Input messages sent to the agent. Always Message[] for consistent shape with output. */ diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 0b351ca58..c0e10451d 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -9,6 +9,7 @@ export { ResultRowSchemaError, normalizeResultRow, } from './evaluation/result-row-schema.js'; +export * from './evaluation/result-artifact-contract.js'; export { parseYamlValue } from './evaluation/yaml-loader.js'; export * from './evaluation/yaml-parser.js'; export { diff --git a/packages/core/test/evaluation/loaders/agent-skills-parser.test.ts b/packages/core/test/evaluation/loaders/agent-skills-parser.test.ts index 166542fc5..40183638e 100644 --- a/packages/core/test/evaluation/loaders/agent-skills-parser.test.ts +++ b/packages/core/test/evaluation/loaders/agent-skills-parser.test.ts @@ -218,6 +218,24 @@ describe('parseAgentSkillsEvals', () => { expect(tests[0].metadata).toBeUndefined(); }); + it('ignores transcript artifact-looking fields in evals.json cases', () => { + const tests = parseAgentSkillsEvals({ + evals: [ + { + 
id: 1, + prompt: 'test prompt', + transcript_path: 'outputs/transcript.jsonl', + raw_provider_log_path: 'outputs/raw/provider.log', + }, + ], + }); + + expect(tests).toHaveLength(1); + expect(tests[0].metadata).toBeUndefined(); + expect(tests[0]).not.toHaveProperty('transcript_path'); + expect(tests[0]).not.toHaveProperty('raw_provider_log_path'); + }); + it('includes source in error messages', () => { expect(() => parseAgentSkillsEvals({}, 'my-evals.json')).toThrow('my-evals.json'); }); diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 768db40b8..a7c71ad39 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -23,6 +23,7 @@ import { type ReplayFixtureRecord, serializeReplayFixtureRecord, } from '../../src/evaluation/replay-fixtures.js'; +import { writeArtifactsFromResults } from '../../src/evaluation/run-artifacts.js'; import { RunBudgetTracker } from '../../src/evaluation/run-budget-tracker.js'; import { buildTraceEnvelopeFromEvaluationResult, @@ -722,6 +723,49 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, expect(result.failureReasonCode).toBe('provider_error'); }); + it('copies and indexes raw provider logs from normal per-case evaluation artifacts', async () => { + const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-raw-provider-log-')); + const rawLogPath = path.join(tempDir, 'provider-native-session.jsonl'); + writeFileSync(rawLogPath, '{"event":"provider-native"}\n', 'utf8'); + + const provider = new SequenceProvider('mock', { + responses: [ + { + output: [{ role: 'assistant', content: 'Raw log evidence preserved.' 
}], + raw: { logFile: rawLogPath }, + }, + ], + }); + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: evaluatorRegistry, + }); + + expect(result.rawProviderLogPath).toBe(rawLogPath); + + const outputDir = path.join(tempDir, 'artifacts'); + await writeArtifactsFromResults([result], outputDir); + + const outputsDir = path.join(outputDir, 'test-dataset', 'case-1', 'outputs'); + expect(readFileSync(path.join(outputsDir, 'raw', 'provider.log'), 'utf8')).toBe( + '{"event":"provider-native"}\n', + ); + expect(readdirSync(outputsDir)).toContain('transcript.jsonl'); + expect(readdirSync(outputsDir)).not.toContain('transcript.json'); + + const indexRows = readFileSync(path.join(outputDir, 'index.jsonl'), 'utf8') + .trim() + .split('\n') + .map((line) => JSON.parse(line) as Record); + expect(indexRows[0]?.raw_provider_log_path).toBe( + 'test-dataset/case-1/outputs/raw/provider.log', + ); + expect(indexRows[0]?.transcript_path).toBe('test-dataset/case-1/outputs/transcript.jsonl'); + }); + it('reports failed progress status for batch item errors', async () => { class BatchProvider implements Provider { readonly id = 'batch:mock'; diff --git a/packages/core/test/evaluation/results-repo.test.ts b/packages/core/test/evaluation/results-repo.test.ts index c006c4e4c..c66852b8f 100644 --- a/packages/core/test/evaluation/results-repo.test.ts +++ b/packages/core/test/evaluation/results-repo.test.ts @@ -6,6 +6,7 @@ import path from 'node:path'; import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; import type { ResultsConfig } from '../../src/evaluation/loaders/config-loader.js'; +import { AGENTV_RESULTS_REFS } from '../../src/evaluation/result-artifact-contract.js'; import { DEFAULT_RESULTS_BRANCH, buildWipBranchName, @@ -51,6 +52,17 @@ function createResultsConfig(repoDir: string, cloneDir: string): ResultsConfig { }; } +function refsHavePrefixConflict(refs: readonly string[]): boolean { + for (const ref 
of refs) { + for (const other of refs) { + if (ref !== other && other.startsWith(`${ref}/`)) { + return true; + } + } + } + return false; +} + function initializeRemoteRepo(rootDir: string): { remoteDir: string; seedDir: string } { const remoteDir = path.join(rootDir, 'results-remote.git'); git(`git init --bare --initial-branch=main --quiet "${remoteDir}"`, rootDir); @@ -332,6 +344,13 @@ describe('results repo write path', () => { ); expect(DEFAULT_RESULTS_BRANCH).toBe('agentv/results/v1'); + expect(DEFAULT_RESULTS_BRANCH).toBe(AGENTV_RESULTS_REFS.primary); + expect(AGENTV_RESULTS_REFS).toEqual({ + primary: 'agentv/results/v1', + artifacts: 'agentv/artifacts/v1', + oplog: 'agentv/oplog/v1', + }); + expect(refsHavePrefixConflict(Object.values(AGENTV_RESULTS_REFS))).toBe(false); expect(normalized.branch).toBe('agentv/results/v1'); expect(normalized.repo_path).toBe('/tmp/source-project'); expect(normalized.auto_push).toBe(false);