Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
4872b7f
feat(results): add artifact pointer contract
christso Jun 21, 2026
0aafea0
feat(results): expose run oplog state watermark
christso Jun 21, 2026
add584d
feat(dashboard): lazy load transcript artifacts
christso Jun 21, 2026
8fdf9bd
docs(results): specify av-quf storage plan
christso Jun 21, 2026
8845406
feat(dashboard): add trace session read model
christso Jun 21, 2026
b20f17f
feat(results): add projection bundle export
christso Jun 21, 2026
79e22b4
feat(artifacts): preserve raw provider logs beside transcripts
christso Jun 21, 2026
f02bcae
docs(results): avoid ref prefix conflicts
christso Jun 21, 2026
61d4ede
fix(results): avoid ref prefix conflicts
christso Jun 21, 2026
3501891
fix(results): avoid prefix-conflicting oplog ref
christso Jun 21, 2026
d13e8ab
fix(artifacts): preserve raw logs in per-case evals
christso Jun 21, 2026
f8d70fe
fix(results): preserve local tag clear watermark
christso Jun 21, 2026
d24c970
fix(dashboard): harden trace session read model
christso Jun 21, 2026
cbb5979
fix(results): preserve artifact pointers when combining runs
christso Jun 21, 2026
bc9c393
fix(results): align projection bundle refs with exports
christso Jun 21, 2026
c69f53d
merge av-quf-storage-plan
christso Jun 21, 2026
756777c
merge av-vwa-16-10-artifact-contract
christso Jun 21, 2026
924d07e
merge av-vwa-16-9-transcript-boundary
christso Jun 21, 2026
2295c81
merge av-vwa-16-10-oplog-state
christso Jun 21, 2026
fdc634d
merge av-kve-1-read-model
christso Jun 21, 2026
7c4c3f7
merge av-vwa-16-10-dashboard-transcript
christso Jun 21, 2026
813d352
merge av-vwa-16-4-export-bundle
christso Jun 21, 2026
ce7cacf
fix(results): finalize reviewed stack integration
christso Jun 21, 2026
b25b047
fix(results): tolerate grader scores without assertions
christso Jun 21, 2026
e331bda
docs(dogfood): record reviewed stack results
christso Jun 21, 2026
979f625
fix(results): preserve portable reviewed artifact contracts
christso Jun 21, 2026
be40cd9
fix(results): block symlink artifact escapes
christso Jun 21, 2026
42dde84
merge fix-integration-contract-review
christso Jun 21, 2026
7b49340
merge fix-serve-symlink-artifacts
christso Jun 21, 2026
825518e
merge dogfood-integration-av-vwa-16-10
christso Jun 21, 2026
93ba064
merge final reviewed results stack
christso Jun 21, 2026
2226913
style: format reviewed results stack
christso Jun 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ export function buildIndexArtifactEntry(
outputPath?: string;
answerPath?: string;
transcriptPath?: string;
rawProviderLogPath?: string;
inputPath?: string;
responsePath?: string;
taskBundle?: MaterializedTaskBundlePaths;
Expand Down
16 changes: 11 additions & 5 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,16 @@ export function trimOutputMessages(
return output;
}

/**
 * Builds the copy of an evaluation result that is appended to the JSONL output.
 *
 * Every field of `result` is carried over unchanged except `output`, which is
 * replaced by whatever `trimOutputMessages` returns for the configured
 * `outputMessages` setting. The input object is not mutated.
 *
 * @param result - The evaluation result to prepare.
 * @param options - Carries `outputMessages`, forwarded verbatim to `trimOutputMessages`.
 * @returns A shallow copy of `result` with the trimmed `output`.
 */
export function prepareResultForJsonl(
  result: EvaluationResult,
  options: { readonly outputMessages: number | 'all' },
): EvaluationResult {
  const trimmedOutput = trimOutputMessages(result.output, options.outputMessages);
  return { ...result, output: trimmedOutput };
}

function normalizeOptions(
rawOptions: Record<string, unknown>,
config?: Awaited<ReturnType<typeof loadTsConfig>>,
Expand Down Expand Up @@ -1043,11 +1053,7 @@ async function runSingleEvalFile(params: {
// Each message is trimmed to { role, content } only (no toolCalls, startTime, etc.).
// Full output with tool calls goes to OTel.
const resultWithMetadata = withSourceMetadata(result, testFilePath, options);
const trimmedOutput = trimOutputMessages(resultWithMetadata.output, options.outputMessages);
const trimmedResult: EvaluationResult = {
...resultWithMetadata,
output: trimmedOutput,
};
const trimmedResult = prepareResultForJsonl(resultWithMetadata, options);
await outputWriter.append(trimmedResult);

// Export to OTel if exporter is configured (skip batch export when streaming is active)
Expand Down
4 changes: 2 additions & 2 deletions apps/cli/src/commands/inspect/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import {
resolveExistingRunPrimaryPath,
resolveWorkspaceOrFilePath,
} from '../eval/result-layout.js';
import { loadManifestResults } from '../results/manifest.js';
import { loadLightweightResults, loadManifestResults } from '../results/manifest.js';
import { ResultRowSchemaError, normalizeResultRow } from '../results/result-row-schema.js';

// ANSI color codes (no dependency needed)
Expand Down Expand Up @@ -636,7 +636,7 @@ export function listResultFilesFromRunsDir(runsDir: string, limit?: number): Res
for (const { filePath, displayName, runId } of limited) {
try {
const fileStat = statSync(filePath);
const results = loadResultFile(filePath);
const results = loadLightweightResults(filePath);

const testCount = results.length;
const passCount = results.filter((r) => r.score >= DEFAULT_THRESHOLD).length;
Expand Down
100 changes: 98 additions & 2 deletions apps/cli/src/commands/results/combine-run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@ import {
} from 'node:fs';
import path from 'node:path';

import type { EvaluationResult } from '@agentv/core';
import type {
EvaluationResult,
ResultArtifactPointerWire,
ResultArtifactPointersWire,
TranscriptArtifactPointerWire,
} from '@agentv/core';

import {
type BenchmarkArtifact,
Expand Down Expand Up @@ -305,26 +310,38 @@ function toRunId(cwd: string, runDir: string): string {
}

// Manifest record keys that hold artifact locations (every entry ends in `_path` or `_dir`).
// `as const` keeps each entry as a string-literal type so the list can be used to index records.
// NOTE(review): presumably this is the set of fields whose referenced files are copied and
// rewritten when runs are combined — confirm against the loop that consumes it.
const MANIFEST_PATH_FIELDS = [
'artifact_dir',
'grading_path',
'timing_path',
'input_path',
'output_path',
'response_path',
'transcript_path',
'raw_provider_log_path',
'task_dir',
'eval_path',
'targets_path',
'files_path',
'graders_path',
] as const;

// Fallback `family` per artifact-pointer kind. rewriteArtifactPointer uses the entry for the
// pointer's kind when the pointer itself carries no `family`, and the chosen family becomes
// the leading segment of the rewritten pointer `key`.
const POINTER_FAMILIES = {
trace: 'traces',
transcript: 'transcripts',
} as const;

/**
 * Reports whether a manifest-supplied path is relative and free of parent-directory hops.
 *
 * A path is rejected when `path.isAbsolute` says it is absolute, or when any of its
 * segments (split on runs of `/` or `\`) is exactly `..`.
 *
 * @param relativePath - Path string taken from a manifest or artifact pointer.
 * @returns `true` when the path passes both checks, `false` otherwise.
 */
function isSafeRelativeArtifactPath(relativePath: string): boolean {
  if (path.isAbsolute(relativePath)) {
    return false;
  }
  const segments = relativePath.split(/[\\/]+/);
  return segments.every((segment) => segment !== '..');
}

function copyReferencedArtifact(
sourceBaseDir: string,
outputDir: string,
sourceIndex: number,
relativePath: string | undefined,
): string | undefined {
if (!relativePath) return undefined;
if (path.isAbsolute(relativePath) || relativePath.split(/[\\/]+/).includes('..')) {
if (!isSafeRelativeArtifactPath(relativePath)) {
throw new Error(`Unsafe artifact path in source manifest: ${relativePath}`);
}
const sourcePath = path.join(sourceBaseDir, relativePath);
Expand All @@ -343,6 +360,71 @@ function copyReferencedArtifact(
return rewritten;
}

/**
 * Re-targets a single artifact pointer at the combined run's output directory.
 *
 * - Returns `undefined` when no pointer is given.
 * - Returns a shallow copy of the pointer, unchanged, when the file it references does not
 *   exist under `sourceBaseDir` or when `copyReferencedArtifact` yields no rewritten path.
 * - Otherwise returns the pointer with `path` set to the value returned by
 *   `copyReferencedArtifact` and `key` set to `<family>/<new path>` (POSIX-joined), where
 *   `family` is the pointer's own `family` or, if absent, the default for `pointerName`.
 *
 * @param pointerName - Pointer kind; selects the fallback family from `POINTER_FAMILIES`.
 * @param pointer - Pointer read from the source manifest, if any.
 * @param sourceBaseDir - Directory that `pointer.path` is resolved against.
 * @param outputDir - Forwarded to `copyReferencedArtifact`.
 * @param sourceIndex - Forwarded to `copyReferencedArtifact`.
 * @throws Error when `pointer.path` is absolute or contains a `..` segment.
 */
function rewriteArtifactPointer(
  pointerName: keyof typeof POINTER_FAMILIES,
  pointer: ResultArtifactPointerWire | undefined,
  sourceBaseDir: string,
  outputDir: string,
  sourceIndex: number,
): ResultArtifactPointerWire | undefined {
  if (!pointer) return undefined;

  const originalPath = pointer.path;
  // Validate before touching the filesystem so an unsafe path is never probed or joined.
  if (!isSafeRelativeArtifactPath(originalPath)) {
    throw new Error(`Unsafe artifact path in source manifest: ${originalPath}`);
  }

  // Only attempt the copy when the referenced file is actually present in the source run.
  const copiedPath = existsSync(path.join(sourceBaseDir, originalPath))
    ? copyReferencedArtifact(sourceBaseDir, outputDir, sourceIndex, originalPath)
    : undefined;
  if (!copiedPath) {
    return { ...pointer };
  }

  const family = pointer.family ?? POINTER_FAMILIES[pointerName];
  const rewrittenKey = path.posix.join(family, copiedPath);
  return { ...pointer, path: copiedPath, key: rewrittenKey };
}

/**
 * Transcript-typed wrapper around `rewriteArtifactPointer`.
 *
 * Delegates with the `'transcript'` pointer kind and casts the result back to the
 * transcript pointer wire type; no additional processing is performed here.
 *
 * @param pointer - Transcript pointer read from the source manifest, if any.
 * @param sourceBaseDir - Directory that the pointer path is resolved against.
 * @param outputDir - Forwarded to `rewriteArtifactPointer`.
 * @param sourceIndex - Forwarded to `rewriteArtifactPointer`.
 * @returns The rewritten pointer, or `undefined` when none was supplied.
 */
function rewriteTranscriptArtifactPointer(
  pointer: TranscriptArtifactPointerWire | undefined,
  sourceBaseDir: string,
  outputDir: string,
  sourceIndex: number,
): TranscriptArtifactPointerWire | undefined {
  const rewritten = rewriteArtifactPointer(
    'transcript',
    pointer,
    sourceBaseDir,
    outputDir,
    sourceIndex,
  );
  return rewritten as TranscriptArtifactPointerWire | undefined;
}

/**
 * Rewrites the `trace` and `transcript` entries of a pointer set for the combined run.
 *
 * Any other properties on `pointers` are carried over as-is. The returned object always
 * has `trace` and `transcript` keys; each is `undefined` when the source had no such pointer.
 *
 * @param pointers - Pointer set read from the source manifest, if any.
 * @param sourceBaseDir - Directory that pointer paths are resolved against.
 * @param outputDir - Forwarded to the per-pointer rewrite helpers.
 * @param sourceIndex - Forwarded to the per-pointer rewrite helpers.
 * @returns The rewritten pointer set, or `undefined` when `pointers` is absent.
 */
function rewriteArtifactPointers(
  pointers: ResultArtifactPointersWire | undefined,
  sourceBaseDir: string,
  outputDir: string,
  sourceIndex: number,
): ResultArtifactPointersWire | undefined {
  if (!pointers) return undefined;

  // Trace is rewritten before transcript, matching the order of the resulting keys.
  const trace = rewriteArtifactPointer(
    'trace',
    pointers.trace,
    sourceBaseDir,
    outputDir,
    sourceIndex,
  );
  const transcript = rewriteTranscriptArtifactPointer(
    pointers.transcript,
    sourceBaseDir,
    outputDir,
    sourceIndex,
  );

  return { ...pointers, trace, transcript };
}

function rewriteAndCopyRecord(row: SelectedRow, outputDir: string): ResultManifestRecord {
const sourceBaseDir = path.dirname(row.source.manifestPath);
const rewritten: Record<string, unknown> = { ...row.record };
Expand All @@ -354,6 +436,20 @@ function rewriteAndCopyRecord(row: SelectedRow, outputDir: string): ResultManife
row.record[field],
);
}
const artifactPointers = rewriteArtifactPointers(
row.record.artifact_pointers,
sourceBaseDir,
outputDir,
row.source.index,
);
rewritten.artifact_pointers = artifactPointers;
if (
row.record.transcript_path &&
rewritten.transcript_path === row.record.transcript_path &&
artifactPointers?.transcript?.path
) {
rewritten.transcript_path = artifactPointers.transcript.path;
}
return rewritten as unknown as ResultManifestRecord;
}

Expand Down
98 changes: 95 additions & 3 deletions apps/cli/src/commands/results/export.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,22 @@
* - To add new per-test workspace files, add them under each test directory.
*/

import { readFileSync } from 'node:fs';
import path from 'node:path';

import { command, oneOf, option, optional, positional, string } from 'cmd-ts';
import { command, flag, oneOf, option, optional, positional, string } from 'cmd-ts';

import type { EvaluationResult, ExportDuplicatePolicy } from '@agentv/core';
import type { EvaluationResult, ExportDuplicatePolicy, IndexArtifactEntry } from '@agentv/core';

import { parseJsonlResults, writeArtifactsFromResults } from '../eval/artifact-writer.js';
import { RESULT_INDEX_FILENAME } from '../eval/result-layout.js';
import { loadManifestResults } from './manifest.js';
import {
type ProjectionBundle,
buildProjectionBundle,
serializeProjectionBundle,
writeProjectionBundle,
} from './projection-bundle.js';
import { loadResults as loadSharedResults, resolveSourceFile } from './shared.js';

// ── Export logic ─────────────────────────────────────────────────────────
Expand Down Expand Up @@ -92,6 +100,36 @@ export async function loadExportSource(
return { sourceFile, results };
}

/**
 * Reads a JSONL index file and returns one parsed entry per non-blank line.
 *
 * Lines are split on `\n` or `\r\n`, trimmed, and skipped when empty. Each remaining line
 * is parsed with `JSON.parse`; the parsed value is cast to `IndexArtifactEntry` without
 * schema validation.
 *
 * @param indexPath - Path of the JSONL index file to read.
 * @returns The parsed entries in file order.
 * @throws SyntaxError when a line is not valid JSON; the message names the file and the
 *   1-based line number so a corrupt row can be located.
 * @throws Whatever `readFileSync` throws when the file cannot be read.
 */
function readIndexArtifactEntries(indexPath: string): IndexArtifactEntry[] {
  const entries: IndexArtifactEntry[] = [];
  const rawLines = readFileSync(indexPath, 'utf8').split(/\r?\n/);

  for (const [lineIndex, rawLine] of rawLines.entries()) {
    const line = rawLine.trim();
    if (!line) continue;

    try {
      entries.push(JSON.parse(line) as IndexArtifactEntry);
    } catch (error: unknown) {
      // Keep the SyntaxError type callers already see, but say where the bad row is.
      const reason = error instanceof Error ? error.message : String(error);
      throw new SyntaxError(`Invalid JSON in ${indexPath} at line ${lineIndex + 1}: ${reason}`);
    }
  }

  return entries;
}

/**
 * Builds a projection bundle from the index file that an export wrote to `outputDir`.
 *
 * The index at `<outputDir>/<RESULT_INDEX_FILENAME>` is read twice: once as raw index
 * entries and once through `loadManifestResults`. Both are handed to
 * `buildProjectionBundle`, with `artifactRefStatus` fixed to `'emitted'` and the run id
 * derived from `sourceFile`.
 *
 * @param options.sourceFile - Source the export was produced from; also used to derive the run id.
 * @param options.outputDir - Directory containing the exported index file.
 * @param options.cwd - Forwarded to `buildProjectionBundle`.
 * @param options.includeRawContent - Forwarded to `buildProjectionBundle`.
 * @param options.duplicatePolicy - Forwarded to `buildProjectionBundle`.
 * @returns The bundle produced by `buildProjectionBundle`.
 */
export function buildProjectionBundleFromExportedIndex(options: {
  readonly sourceFile: string;
  readonly outputDir: string;
  readonly cwd?: string;
  readonly includeRawContent?: boolean;
  readonly duplicatePolicy?: ExportDuplicatePolicy;
}): ProjectionBundle {
  const { sourceFile, outputDir, cwd, duplicatePolicy, includeRawContent } = options;

  const exportedIndexPath = path.join(outputDir, RESULT_INDEX_FILENAME);
  const indexRecords = readIndexArtifactEntries(exportedIndexPath);
  const emittedResults = loadManifestResults(exportedIndexPath);

  return buildProjectionBundle(emittedResults, {
    sourceFile,
    runId: deriveExportRunId(sourceFile),
    cwd,
    duplicatePolicy,
    includeRawContent,
    artifactRefStatus: 'emitted',
    indexRecords,
  });
}

// ── CLI command ──────────────────────────────────────────────────────────

export const resultsExportCommand = command({
Expand Down Expand Up @@ -122,10 +160,34 @@ export const resultsExportCommand = command({
description:
'How to handle duplicate projection identities in the output: update (default), skip, or error',
}),
projectionBundle: flag({
long: 'projection-bundle',
description: 'Write a vendor-neutral projection_bundle.json alongside exported artifacts',
}),
dryRun: flag({
long: 'dry-run',
description: 'Print deterministic projection bundle JSON without writing export artifacts',
}),
includeRawContent: flag({
long: 'include-raw-content',
description:
'Include raw prompt, output, and tool payload content in the projection bundle (off by default)',
}),
},
handler: async ({ source, out, dir, duplicatePolicy }) => {
handler: async ({
source,
out,
dir,
duplicatePolicy,
projectionBundle,
dryRun,
includeRawContent,
}) => {
const cwd = dir ?? process.cwd();
const policy = (duplicatePolicy ?? 'update') as ExportDuplicatePolicy;
const shouldWriteProjectionBundle = projectionBundle;
const shouldDryRun = dryRun;
const shouldIncludeRawContent = includeRawContent;

try {
const { sourceFile, results } = await loadExportSource(source, cwd);
Expand All @@ -136,14 +198,44 @@ export const resultsExportCommand = command({
: path.resolve(cwd, out)
: deriveOutputDir(cwd, sourceFile);

const buildBundle = () =>
buildProjectionBundle(results, {
sourceFile,
runId: deriveExportRunId(sourceFile),
cwd,
duplicatePolicy: policy,
includeRawContent: shouldIncludeRawContent,
});

if (shouldDryRun) {
process.stdout.write(serializeProjectionBundle(buildBundle()));
return;
}

await writeArtifactsFromResults(results, outputDir, {
evalFile: sourceFile,
runId: deriveExportRunId(sourceFile),
duplicatePolicy: policy,
});

const bundlePath = shouldWriteProjectionBundle
? await writeProjectionBundle(
buildProjectionBundleFromExportedIndex({
sourceFile,
outputDir,
cwd,
duplicatePolicy: policy,
includeRawContent: shouldIncludeRawContent,
}),
outputDir,
)
: undefined;

// Report exported test IDs
console.log(`Exported ${results.length} test(s) to ${outputDir}`);
if (bundlePath) {
console.log(`Projection bundle written to ${bundlePath}`);
}
for (const result of results) {
console.log(` ${result.testId ?? 'unknown'}`);
}
Expand Down
Loading
Loading