From 27d164103995aea671631cfc0011479afd86f8b8 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Fri, 26 Jun 2026 18:53:23 -0500
Subject: [PATCH 1/4] feat(evals): add SWE-bench Lite accuracy eval
lm-eval cannot score SWE-bench (no repo-level Docker test executor), so this
reuses lm-eval for patch *generation* and adds a scoring step that runs the
official swebench harness, emitting an lm-eval-shaped results JSON so the
existing collect/validate pipeline works unchanged.
- swebench_lite.yaml: lm-eval generate_until task over SWE-bench Lite
- swebench_score.py: diff extraction -> predictions.jsonl -> swebench harness
-> resolved-rate -> lm-eval-shaped results; offline --report/--predictions-only
- benchmark_lib.sh: run_swebench_eval + --framework swebench dispatch
- collect_eval_results.py: recognize 'resolved' filter as the primary score
- thresholds.json: placeholder swebench_lite entry (needs calibration)
- EVALS.md: document the new framework + task
- test_swebench_eval.py: unit + integration tests
---
benchmarks/benchmark_lib.sh | 78 +++++++
utils/collect_eval_results.py | 4 +-
utils/evals/EVALS.md | 25 +++
utils/evals/swebench_lite.yaml | 56 +++++
utils/evals/swebench_score.py | 340 ++++++++++++++++++++++++++++++
utils/evals/test_swebench_eval.py | 182 ++++++++++++++++
utils/evals/thresholds.json | 3 +-
7 files changed, 686 insertions(+), 2 deletions(-)
create mode 100644 utils/evals/swebench_lite.yaml
create mode 100644 utils/evals/swebench_score.py
create mode 100644 utils/evals/test_swebench_eval.py
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 95e063a3d..70353e65d 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -966,6 +966,83 @@ META
echo "Moved eval artifacts to: $(pwd)"
}
+# ------------------------------
+# SWE-bench eval helpers
+# ------------------------------
+
+# Run the SWE-bench Lite eval: generate patches with lm-eval, then score them
+# with the official swebench Docker harness. lm-eval cannot score SWE-bench
+# itself (no repo-level test executor), so we reuse it only for generation and
+# emit an lm-eval-shaped results JSON from swebench_score.py so the rest of the
+# pipeline (append_lm_eval_summary / collect / validate) is unchanged.
+#
+# Env knobs:
+# SWEBENCH_DATASET (default princeton-nlp/SWE-bench_Lite)
+# SWEBENCH_TASK_NAME (default swebench_lite)
+# SWEBENCH_MAX_WORKERS (default 4) harness Docker workers
+# SWEBENCH_NAMESPACE pass "" on arm/Mac to build images locally
+# SWEBENCH_SKIP_SCORE "true" => generate + stage predictions only, no Docker
+# (for runners without Docker; score elsewhere)
+run_swebench_eval() {
+ local out_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}"
+ local task_name="${SWEBENCH_TASK_NAME:-swebench_lite}"
+ local gen_dir
+ gen_dir=$(mktemp -d /tmp/swebench_gen-XXXXXX)
+
+ # 1. Generation via lm-eval (reuses endpoint wiring, _patch_lm_eval, etc.).
+ # run_lm_eval already passes --log_samples, which is what we consume.
+ local prev_tasks_dir="${EVAL_TASKS_DIR:-}"
+ export EVAL_TASKS_DIR="${EVAL_TASKS_DIR:-utils/evals/${task_name}.yaml}"
+ local gen_rc=0
+ run_lm_eval "$@" --results-dir "$gen_dir" || gen_rc=$?
+ export EVAL_TASKS_DIR="$prev_tasks_dir"
+ if [ "$gen_rc" -ne 0 ]; then
+ echo "ERROR: swebench generation (lm-eval) failed with $gen_rc" >&2
+ rm -rf "$gen_dir" 2>/dev/null || true
+ return "$gen_rc"
+ fi
+
+ # Preserve generations as artifacts alongside the scored results.
+ mkdir -p "$out_dir"
+ find "$gen_dir" -name 'samples_*.jsonl' -exec cp -f {} "$out_dir"/ \; 2>/dev/null || true
+ export EVAL_RESULT_DIR="$out_dir"
+
+ local lm_eval_version
+ lm_eval_version=$(python3 -c 'import lm_eval; print(lm_eval.__version__)' 2>/dev/null || echo unknown)
+
+ if [ "${SWEBENCH_SKIP_SCORE:-false}" = "true" ]; then
+ # Generation-only mode: emit predictions, defer Docker scoring elsewhere.
+ # TODO(alec): wire the separate scoring job (Modal / sb-cli / CPU runner).
+ local skip_rc=0
+ python3 utils/evals/swebench_score.py \
+ --samples-dir "$gen_dir" --out-dir "$out_dir" \
+ --model-name "${MODEL_NAME:-$MODEL}" --task-name "$task_name" \
+ --predictions-only || skip_rc=$?
+ echo "SWEBENCH_SKIP_SCORE=true: staged predictions only (no resolved-rate)." >&2
+ rm -rf "$gen_dir" 2>/dev/null || true
+ return "$skip_rc"
+ fi
+
+ # 2. Score with the official swebench harness (requires Docker) and emit the
+ # lm-eval-shaped results JSON into EVAL_RESULT_DIR.
+ local score_rc=0
+ python3 utils/evals/swebench_score.py \
+ --samples-dir "$gen_dir" \
+ --out-dir "$out_dir" \
+ --model-name "${MODEL_NAME:-$MODEL}" \
+ --task-name "$task_name" \
+ --dataset-name "${SWEBENCH_DATASET:-princeton-nlp/SWE-bench_Lite}" \
+ --max-workers "${SWEBENCH_MAX_WORKERS:-4}" \
+ --lm-eval-version "$lm_eval_version" \
+ ${SWEBENCH_NAMESPACE+--namespace "$SWEBENCH_NAMESPACE"} \
+ || score_rc=$?
+ rm -rf "$gen_dir" 2>/dev/null || true
+ if [ "$score_rc" -ne 0 ]; then
+ echo "ERROR: swebench scoring failed with $score_rc" >&2
+ return "$score_rc"
+ fi
+}
+
# ------------------------------
# Unified eval entrypoint
# ------------------------------
@@ -1052,6 +1129,7 @@ run_eval() {
local eval_rc=0
case "$framework" in
lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" || eval_rc=$? ;;
+ swebench) run_swebench_eval "${forwarded[@]}" || eval_rc=$? ;;
*) echo "Unknown framework '${framework}'"; eval_rc=1 ;;
esac
diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index 194fa4acb..f98a6f7c4 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -141,7 +141,9 @@ def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]:
# Extract metrics for each filter
for f in filter_list:
fname = f['name']
- if 'strict' in fname:
+ # 'resolved' is SWE-bench's resolved-rate (swebench_score.py);
+ # treat it as the primary/strict score so it populates `score`.
+ if 'strict' in fname or 'resolved' in fname:
strict_val, strict_se = get_val_se(fname)
elif 'flex' in fname or 'extract' in fname:
flex_val, flex_se = get_val_se(fname)
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index 7ff878dce..752e13131 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -169,7 +169,32 @@ The codebase patches lm-eval compatibility via `_patch_lm_eval`:
1. Reasoning token handling: extracts `reasoning_content` when `message.content` is empty.
2. TRT compatibility: avoids injecting `{"type": "text"}` for non-HF tokenizers.
+### SWE-bench Lite (`--framework swebench`)
+
+SWE-bench is **not** a `generate_until` QA task — it requires applying the model's
+patch to a repo and running tests in Docker, which lm-eval cannot do. So it runs
+through a dedicated framework that reuses lm-eval for *generation* only, then scores
+with the official `swebench` harness and emits an lm-eval-shaped results JSON
+(metric `exact_match,resolved` = resolved-rate) so collect/validate work unchanged.
+
+```bash
+run_eval --framework swebench --port "$PORT" # generation (lm-eval) -> scoring (swebench)
+append_lm_eval_summary
+```
+
+- Task: `utils/evals/swebench_lite.yaml` (generation) — SWE-bench Lite, the ~300-instance curated
+ quick-eval subset (no difficulty filter needed; Lite is already the lightweight set).
+- Scoring: `utils/evals/swebench_score.py` (diff extraction → `predictions.jsonl` →
+ `python -m swebench.harness.run_evaluation` → resolved-rate → results JSON). Offline
+ `--report` mode skips Docker for testing.
+- Knobs: `SWEBENCH_DATASET`, `SWEBENCH_TASK_NAME`, `SWEBENCH_MAX_WORKERS`,
+ `SWEBENCH_NAMESPACE` (pass `""` on arm/Mac), `SWEBENCH_SKIP_SCORE=true` (generate-only).
+- **Requires Docker + ~120 GB disk on the scoring host.** This is an MVP; the single-shot prompt and
+ diff extraction still need tuning to reach published resolved-rates, and the `thresholds.json` entry
+ needs calibration from a baseline run.
+
## Task files
The following files are task definitions from lm-eval; more information on changes lives within the files:
- `utils/evals/gsm8k.yaml`
- `utils/evals/gpqa_diamond.yaml`
+- `utils/evals/swebench_lite.yaml` (generation only; scored by `swebench_score.py`)
diff --git a/utils/evals/swebench_lite.yaml b/utils/evals/swebench_lite.yaml
new file mode 100644
index 000000000..4633af462
--- /dev/null
+++ b/utils/evals/swebench_lite.yaml
@@ -0,0 +1,56 @@
+# SWE-bench Lite -- GENERATION ONLY.
+#
+# Lite is the ~300-instance curated subset for quick evals (no difficulty labels;
+# it's already the lightweight set, so no filtering is needed -- unlike Verified,
+# which carries a `difficulty` field).
+#
+# lm-eval is used purely to drive the served OpenAI-compatible endpoint and dump
+# one candidate patch per instance via --log_samples. The metric below is a
+# PLACEHOLDER that lm-eval computes but we ignore: the real resolved-rate comes
+# from utils/evals/swebench_score.py running the official `swebench` harness,
+# which then emits an lm-eval-shaped results JSON for collect/validate.
+#
+# Run it through the dedicated framework, not bare lm-eval:
+# run_eval --framework swebench --port "$PORT"
+# which wires generation -> scoring. Bare `--tasks swebench_lite.yaml` would
+# produce only the meaningless placeholder metric.
+task: swebench_lite
+dataset_path: princeton-nlp/SWE-bench_Lite # also mirrored at SWE-bench/SWE-bench_Lite
+output_type: generate_until
+test_split: test
+
+doc_to_text: |
+ You are an expert software engineer fixing a real GitHub issue in the
+ repository `{{repo}}` at commit {{base_commit}}.
+
+
+ {{problem_statement}}
+
+
+ Respond with ONLY a unified diff (a git patch) that resolves the issue, using
+ real repository file paths. Do not include explanations. Wrap the patch in a
+ single fenced block exactly like:
+
+ ```diff
+ diff --git a/path/to/file.py b/path/to/file.py
+ --- a/path/to/file.py
+ +++ b/path/to/file.py
+ @@ ... @@
+ ```
+# The gold patch is the nominal target. lm-eval's exact_match against it is
+# meaningless for patches (overwritten by the harness score); it only exists so
+# generate_until has a target + a metric and does not error.
+doc_to_target: "{{patch}}"
+
+generation_kwargs:
+ until: []
+ do_sample: false
+ temperature: 0.0
+
+metric_list:
+ - metric: exact_match
+ aggregation: mean
+ higher_is_better: true
+
+metadata:
+ version: 0.1
diff --git a/utils/evals/swebench_score.py b/utils/evals/swebench_score.py
new file mode 100644
index 000000000..edf8ef212
--- /dev/null
+++ b/utils/evals/swebench_score.py
@@ -0,0 +1,340 @@
+#!/usr/bin/env python3
+"""Score SWE-bench patches generated by lm-eval and emit an lm-eval-shaped result.
+
+Pipeline:
+
+ 1. Read lm-eval ``--log_samples`` output (samples_*.jsonl): one candidate per
+ SWE-bench instance.
+ 2. Extract a unified diff from each model generation.
+ 3. Write a ``predictions.jsonl`` in the format the official ``swebench`` harness
+ expects: ``{instance_id, model_name_or_path, model_patch}``.
+ 4. Run ``python -m swebench.harness.run_evaluation`` (Docker) to get the
+ resolved-rate -- unless ``--no-run``/``--report`` is given (offline/testing).
+ 5. Emit a results JSON shaped like an lm-eval result so the existing
+ ``collect_eval_results.py`` / ``validate_scores.py`` ingest it unchanged.
+ The metric is published as ``exact_match,resolved`` = resolved-rate.
+
+The harness needs Docker + lots of disk and is NOT runnable on this dev Mac, so
+the Docker step can be skipped with ``--report`` (``--no-run`` alone is rejected)
+for local testing. TODO(alec): exercise the real Docker harness path on a runner.
+"""
+
+import argparse
+import json
+import math
+import re
+import subprocess
+import sys
+from pathlib import Path
+from typing import Any, Iterator, Optional
+
+DEFAULT_DATASET = "princeton-nlp/SWE-bench_Lite"
+DEFAULT_TASK = "swebench_lite"
+
+# A unified diff, optionally inside a ```diff / ```patch fence. We try fenced
+# first (what the prompt asks for), then a bare ``diff --git`` slice.
+_FENCED_DIFF_RE = re.compile(
+ r"```(?:diff|patch)?\s*\n(?P<body>.*?)```",
+ re.DOTALL | re.IGNORECASE,
+)
+_DIFF_GIT_RE = re.compile(r"(?:^|\n)(diff --git .*)", re.DOTALL)
+
+
+def extract_patch(text: str) -> str:
+ """Pull a unified diff out of a model generation.
+
+ Best-effort and deliberately conservative -- a wrong extraction just means
+ that instance is unresolved, never a crash. Diff-extraction quality is a
+ primary tuning lever (TODO(alec)): bad fences here directly suppress the
+ resolved-rate.
+ """
+ if not text:
+ return ""
+ # 1. Prefer a fenced block that actually looks like a diff.
+ for match in _FENCED_DIFF_RE.finditer(text):
+ body = match.group("body")
+ if "diff --git" in body or body.lstrip().startswith(("--- ", "+++ ")):
+ return body.strip("\n") + "\n"
+ # 2. Fall back to the first ``diff --git`` to end-of-text.
+ git_match = _DIFF_GIT_RE.search(text)
+ if git_match:
+ return git_match.group(1).strip("\n") + "\n"
+ # 3. Last resort: a lone fenced block, or the raw text.
+ lone = _FENCED_DIFF_RE.search(text)
+ if lone:
+ return lone.group("body").strip("\n") + "\n"
+ return text.strip("\n") + "\n" if text.strip() else ""
+
+
+def _response_text(record: dict) -> str:
+ """Extract the model's text from one lm-eval sample record.
+
+ lm-eval's sample schema has drifted across versions; be tolerant.
+ TODO(alec): confirm against the pinned harness's real samples_*.jsonl.
+ """
+ for key in ("filtered_resps", "resps"):
+ val = record.get(key)
+ while isinstance(val, (list, tuple)) and val:
+ val = val[0]
+ if isinstance(val, str) and val.strip():
+ return val
+ return ""
+
+
+def _instance_id(record: dict) -> Optional[str]:
+ doc = record.get("doc")
+ if isinstance(doc, dict):
+ for key in ("instance_id", "instance", "id"):
+ val = doc.get(key)
+ if isinstance(val, str) and val:
+ return val
+ # Some versions hoist doc fields to the top level.
+ val = record.get("instance_id")
+ return val if isinstance(val, str) and val else None
+
+
+def iter_samples(samples_dir: Path) -> Iterator[dict]:
+ """Yield JSON records from every samples_*.jsonl under ``samples_dir``."""
+ files = sorted(samples_dir.rglob("samples_*.jsonl"))
+ if not files:
+ raise FileNotFoundError(
+ f"no samples_*.jsonl found under {samples_dir} -- did lm-eval run "
+ "with --log_samples?"
+ )
+ for path in files:
+ with path.open() as fh:
+ for line in fh:
+ line = line.strip()
+ if line:
+ yield json.loads(line)
+
+
+def build_predictions(samples_dir: Path, model_name: str) -> list[dict]:
+ """Turn lm-eval samples into swebench prediction rows (dedup by instance)."""
+ by_instance: dict[str, dict] = {}
+ skipped = 0
+ for record in iter_samples(samples_dir):
+ instance_id = _instance_id(record)
+ if not instance_id:
+ skipped += 1
+ continue
+ patch = extract_patch(_response_text(record))
+ # Last write wins; SWE-bench is single-attempt so there should be one
+ # record per instance anyway.
+ by_instance[instance_id] = {
+ "instance_id": instance_id,
+ "model_name_or_path": model_name,
+ "model_patch": patch,
+ }
+ if skipped:
+ print(f"WARN: skipped {skipped} sample(s) with no instance_id", file=sys.stderr)
+ if not by_instance:
+ raise ValueError("no usable predictions extracted from samples")
+ return list(by_instance.values())
+
+
+def write_predictions(predictions: list[dict], out_path: Path) -> None:
+ with out_path.open("w") as fh:
+ for row in predictions:
+ fh.write(json.dumps(row) + "\n")
+
+
+def run_harness(
+ predictions_path: Path,
+ dataset_name: str,
+ run_id: str,
+ work_dir: Path,
+ max_workers: int,
+ namespace: Optional[str],
+) -> None:
+ """Invoke the official swebench Docker harness (requires Docker)."""
+ cmd = [
+ sys.executable, "-m", "swebench.harness.run_evaluation",
+ "--dataset_name", dataset_name,
+ "--predictions_path", str(predictions_path),
+ "--run_id", run_id,
+ "--max_workers", str(max_workers),
+ ]
+ if namespace is not None:
+ # On arm/Mac (and to force local image builds) pass --namespace ''.
+ cmd += ["--namespace", namespace]
+ print(f"[swebench] running: {' '.join(cmd)}", flush=True)
+ subprocess.run(cmd, cwd=str(work_dir), check=True)
+
+
+def find_report(work_dir: Path, model_name: str, run_id: str) -> Path:
+ """Locate the harness report JSON, tolerant to known layout variants."""
+ sanitized = model_name.replace("/", "__")
+ candidates = [
+ work_dir / f"{sanitized}.{run_id}.json", # classic: <model>.<run_id>.json
+ work_dir / f"{model_name}.{run_id}.json",
+ work_dir / "evaluation_results" / "results.json", # newer layout
+ ]
+ for path in candidates:
+ if path.exists():
+ return path
+ # Broad fallback: any *.json mentioning resolved/total at the top level.
+ for path in sorted(work_dir.rglob("*.json")):
+ try:
+ data = json.loads(path.read_text())
+ except (json.JSONDecodeError, OSError):
+ continue
+ if isinstance(data, dict) and (
+ "resolved_instances" in data or "resolved_ids" in data
+ ):
+ return path
+ raise FileNotFoundError(
+ f"could not locate a swebench report under {work_dir} "
+ f"(looked for {[str(c) for c in candidates]})"
+ )
+
+
+def parse_resolved(report: dict) -> tuple[int, int]:
+ """Return (resolved, total) from a harness report, tolerant to key variants.
+
+ Denominator is the full instance count (leaderboard convention:
+ resolved / total), not just completed instances.
+ """
+ resolved: Optional[int] = None
+ for key in ("resolved_instances", "resolved", "num_resolved"):
+ if isinstance(report.get(key), int):
+ resolved = report[key]
+ break
+ if resolved is None and isinstance(report.get("resolved_ids"), list):
+ resolved = len(report["resolved_ids"])
+
+ total: Optional[int] = None
+ for key in ("total_instances", "completed_instances", "submitted_instances"):
+ val = report.get(key)
+ if isinstance(val, int) and val > 0:
+ total = val
+ break
+ if total is None:
+ for key in ("completed_ids", "submitted_ids"):
+ if isinstance(report.get(key), list) and report[key]:
+ total = len(report[key])
+ break
+
+ if resolved is None or total is None or total <= 0:
+ raise ValueError(
+ f"could not parse resolved/total from report keys {sorted(report)}"
+ )
+ return resolved, total
+
+
+def build_results_json(
+ task: str,
+ resolved: int,
+ total: int,
+ model_name: str,
+ lm_eval_version: str,
+ report: Optional[dict],
+) -> dict:
+ """Shape the resolved-rate as an lm-eval result.
+
+ Published as ``exact_match,resolved`` so validate_scores (prefix
+ ``exact_match,``) gates it and collect_eval_results surfaces it as ``score``.
+ """
+ rate = resolved / total
+ stderr = math.sqrt(rate * (1.0 - rate) / total) if total else 0.0
+ return {
+ "lm_eval_version": lm_eval_version,
+ "model_name": model_name,
+ "results": {
+ task: {
+ "alias": task,
+ "exact_match,resolved": rate,
+ "exact_match_stderr,resolved": stderr,
+ }
+ },
+ "configs": {
+ task: {
+ "metric_list": [{"metric": "exact_match"}],
+ "filter_list": [{"name": "resolved"}],
+ }
+ },
+ "n-samples": {task: {"effective": total, "original": total}},
+ # Debugging passthrough; ignored by collectors (no lm_eval_version here).
+ "swebench": {
+ "resolved": resolved,
+ "total": total,
+ "resolved_rate": rate,
+ "report": report,
+ },
+ }
+
+
+def main(argv: Optional[list[str]] = None) -> int:
+ parser = argparse.ArgumentParser(description="Score SWE-bench patches from lm-eval samples")
+ parser.add_argument("--samples-dir", required=True, help="dir containing lm-eval samples_*.jsonl")
+ parser.add_argument("--out-dir", required=True, help="dir to write predictions + results JSON")
+ parser.add_argument("--model-name", required=True, help="served model name (model_name_or_path)")
+ parser.add_argument("--dataset-name", default=DEFAULT_DATASET)
+ parser.add_argument("--task-name", default=DEFAULT_TASK)
+ parser.add_argument("--run-id", default=None, help="harness run id (default: task name)")
+ parser.add_argument("--max-workers", type=int, default=4)
+ parser.add_argument(
+ "--namespace", default=None,
+ help="swebench --namespace value (pass '' on arm/Mac to build images locally)",
+ )
+ parser.add_argument("--lm-eval-version", default="unknown")
+ parser.add_argument(
+ "--predictions-only", action="store_true",
+ help="write predictions.jsonl and stop (no scoring; score elsewhere)",
+ )
+ parser.add_argument(
+ "--no-run", action="store_true",
+ help="skip the Docker harness; requires --report (offline/testing)",
+ )
+ parser.add_argument(
+ "--report", default=None,
+ help="path to a pre-computed harness report JSON (implies --no-run)",
+ )
+ args = parser.parse_args(argv)
+
+ samples_dir = Path(args.samples_dir)
+ out_dir = Path(args.out_dir)
+ out_dir.mkdir(parents=True, exist_ok=True)
+ run_id = args.run_id or args.task_name
+
+ # 1-3. samples -> predictions.jsonl
+ predictions = build_predictions(samples_dir, args.model_name)
+ predictions_path = out_dir / "predictions.jsonl"
+ write_predictions(predictions, predictions_path)
+ print(f"[swebench] wrote {len(predictions)} predictions -> {predictions_path}")
+
+ if args.predictions_only:
+ print("[swebench] predictions-only: skipping scoring (score elsewhere)")
+ return 0
+
+ # 4. score (Docker) or load an existing report
+ if args.report:
+ report = json.loads(Path(args.report).read_text())
+ elif args.no_run:
+ print("ERROR: --no-run requires --report", file=sys.stderr)
+ return 1
+ else:
+ run_harness(
+ predictions_path, args.dataset_name, run_id,
+ out_dir, args.max_workers, args.namespace,
+ )
+ report = json.loads(find_report(out_dir, args.model_name, run_id).read_text())
+
+ resolved, total = parse_resolved(report)
+
+ # 5. emit lm-eval-shaped results
+ results = build_results_json(
+ args.task_name, resolved, total, args.model_name,
+ args.lm_eval_version, report,
+ )
+ results_path = out_dir / f"results_{args.task_name}.json"
+ results_path.write_text(json.dumps(results, indent=2))
+ print(
+ f"[swebench] {args.task_name}: resolved {resolved}/{total} "
+ f"= {resolved / total:.4f} -> {results_path}"
+ )
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/utils/evals/test_swebench_eval.py b/utils/evals/test_swebench_eval.py
new file mode 100644
index 000000000..8e59975ca
--- /dev/null
+++ b/utils/evals/test_swebench_eval.py
@@ -0,0 +1,182 @@
+"""Tests for the SWE-bench Lite eval MVP (generation -> scoring -> lm-eval shape).
+
+Pure-stdlib paths (extract_patch, predictions, report parsing, results shape)
+run on any interpreter. The dataset filter and the collect/validate integration
+guard on optional deps / interpreter version so the file imports cleanly even on
+the macOS system python 3.9 used for local spot-checks.
+"""
+
+import json
+import sys
+from pathlib import Path
+
+import pytest
+
+sys.path.insert(0, str(Path(__file__).resolve().parent)) # utils/evals
+sys.path.insert(0, str(Path(__file__).resolve().parents[1])) # utils
+
+import swebench_score as sbs
+
+
+# --- diff extraction -------------------------------------------------------
+
+def test_extract_patch_from_diff_fence():
+ text = (
+ "Here is the fix:\n\n```diff\n"
+ "diff --git a/f.py b/f.py\n--- a/f.py\n+++ b/f.py\n"
+ "@@ -1 +1 @@\n-old\n+new\n```\nDone."
+ )
+ patch = sbs.extract_patch(text)
+ assert patch.startswith("diff --git a/f.py b/f.py")
+ assert patch.endswith("\n")
+ assert "Here is the fix" not in patch
+ assert "Done." not in patch
+
+
+def test_extract_patch_bare_diff_git():
+ text = "no fence\ndiff --git a/x b/x\n@@ @@\n-a\n+b\n"
+ patch = sbs.extract_patch(text)
+ assert patch.startswith("diff --git a/x b/x")
+ assert "no fence" not in patch
+
+
+def test_extract_patch_empty_when_no_diff():
+ assert sbs.extract_patch("") == ""
+ # Prose with no diff markers falls back to the raw text (harness will reject).
+ assert sbs.extract_patch("just words").strip() == "just words"
+
+
+# --- samples -> predictions ------------------------------------------------
+
+def _write_samples(dirpath: Path, records: list[dict]) -> None:
+ with (dirpath / "samples_swebench_lite_2026.jsonl").open("w") as fh:
+ for rec in records:
+ fh.write(json.dumps(rec) + "\n")
+
+
+def test_build_predictions_extracts_instance_and_patch(tmp_path):
+ _write_samples(tmp_path, [
+ {
+ "doc": {"instance_id": "repo__proj-1"},
+ "filtered_resps": ["```diff\ndiff --git a/a b/a\n+x\n```"],
+ },
+ {
+ "doc": {"instance_id": "repo__proj-2"},
+ "resps": [["diff --git a/b b/b\n+y\n"]],
+ },
+ ])
+ preds = sbs.build_predictions(tmp_path, "my-model")
+ by_id = {p["instance_id"]: p for p in preds}
+ assert set(by_id) == {"repo__proj-1", "repo__proj-2"}
+ assert by_id["repo__proj-1"]["model_name_or_path"] == "my-model"
+ assert by_id["repo__proj-1"]["model_patch"].startswith("diff --git a/a b/a")
+ assert by_id["repo__proj-2"]["model_patch"].startswith("diff --git a/b b/b")
+
+
+def test_build_predictions_raises_without_samples(tmp_path):
+ with pytest.raises(FileNotFoundError):
+ sbs.build_predictions(tmp_path, "m")
+
+
+# --- report parsing --------------------------------------------------------
+
+def test_parse_resolved_classic_counts():
+ assert sbs.parse_resolved(
+ {"resolved_instances": 80, "total_instances": 196}
+ ) == (80, 196)
+
+
+def test_parse_resolved_from_id_lists():
+ report = {"resolved_ids": ["a", "b", "c"], "completed_ids": ["a", "b", "c", "d"]}
+ # no total_instances -> falls back to completed_ids length
+ assert sbs.parse_resolved(report) == (3, 4)
+
+
+def test_parse_resolved_raises_on_garbage():
+ with pytest.raises(ValueError):
+ sbs.parse_resolved({"nope": 1})
+
+
+# --- lm-eval-shaped results ------------------------------------------------
+
+def test_build_results_json_is_lm_eval_shaped():
+ res = sbs.build_results_json(
+ "swebench_lite", 49, 196, "m", "0.4.12", {"resolved_instances": 49}
+ )
+ assert "lm_eval_version" in res # detection key for collect_eval_results
+ task = res["results"]["swebench_lite"]
+ assert task["exact_match,resolved"] == pytest.approx(0.25)
+ cfg = res["configs"]["swebench_lite"]
+ assert cfg["filter_list"] == [{"name": "resolved"}]
+ assert res["n-samples"]["swebench_lite"]["effective"] == 196
+
+
+def test_score_offline_end_to_end(tmp_path):
+ """--report path: samples -> predictions + results JSON, no Docker."""
+ samples = tmp_path / "gen"
+ samples.mkdir()
+ _write_samples(samples, [
+ {"doc": {"instance_id": "r__p-1"}, "filtered_resps": ["```diff\ndiff --git a/a b/a\n+x\n```"]},
+ ])
+ report = tmp_path / "report.json"
+ report.write_text(json.dumps({"resolved_instances": 1, "total_instances": 1}))
+ out = tmp_path / "out"
+ rc = sbs.main([
+ "--samples-dir", str(samples), "--out-dir", str(out),
+ "--model-name", "m", "--report", str(report),
+ ])
+ assert rc == 0
+ assert (out / "predictions.jsonl").exists()
+ results = json.loads((out / "results_swebench_lite.json").read_text())
+ assert results["results"]["swebench_lite"]["exact_match,resolved"] == 1.0
+
+
+def test_predictions_only_writes_predictions_no_results(tmp_path):
+ """SWEBENCH_SKIP_SCORE path: predictions only, no Docker, no results JSON."""
+ samples = tmp_path / "gen"
+ samples.mkdir()
+ _write_samples(samples, [
+ {"doc": {"instance_id": "r__p-1"}, "filtered_resps": ["```diff\ndiff --git a/a b/a\n+x\n```"]},
+ ])
+ out = tmp_path / "out"
+ rc = sbs.main([
+ "--samples-dir", str(samples), "--out-dir", str(out),
+ "--model-name", "m", "--predictions-only",
+ ])
+ assert rc == 0
+ assert (out / "predictions.jsonl").exists()
+ assert not (out / "results_swebench_lite.json").exists()
+
+
+# --- integration with the existing pipeline (needs tabulate + py3.10+) -----
+
+@pytest.mark.skipif(sys.version_info < (3, 10), reason="repo modules use py3.10 syntax")
+def test_results_json_flows_through_collect_and_validate(tmp_path, monkeypatch):
+ pytest.importorskip("tabulate")
+ import collect_eval_results as cer
+ import validate_scores as vs
+
+ art = tmp_path / "eval"
+ art.mkdir()
+ (art / "meta_env.json").write_text(json.dumps({
+ "infmax_model_prefix": "dsr1", "hw": "b200", "framework": "sglang",
+ "precision": "fp8", "isl": 8192, "osl": 1024,
+ }))
+ res = sbs.build_results_json(
+ "swebench_lite", 150, 300, "dsr1", "0.4.12", None
+ )
+ (art / "results_swebench_lite.json").write_text(json.dumps(res))
+
+ # collect surfaces the resolved-rate as the unified `score`.
+ rows = cer.collect_eval_rows(tmp_path)
+ assert len(rows) == 1
+ assert rows[0]["task"] == "swebench_lite"
+ assert rows[0]["score"] == pytest.approx(0.5)
+
+ # validate_scores gates exact_match,resolved against thresholds.json (0.10).
+ monkeypatch.chdir(art)
+ monkeypatch.setattr(sys, "argv", [
+ "validate_scores.py",
+ "--results-glob", "results_swebench_lite.json",
+ ])
+ assert vs.main() == 0 # 0.5 >= 0.10 default threshold
diff --git a/utils/evals/thresholds.json b/utils/evals/thresholds.json
index d6c091152..cbbe65105 100644
--- a/utils/evals/thresholds.json
+++ b/utils/evals/thresholds.json
@@ -1,7 +1,8 @@
{
"default": {
"gsm8k": 0.90,
- "gpqa_diamond_cot_n_shot": 0.30
+ "gpqa_diamond_cot_n_shot": 0.30,
+ "swebench_lite": 0.10
},
"models": {
"dsr1": {
From 4886287d3daec61a1f4da3bd9dcdbdb5f014a23d Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Fri, 26 Jun 2026 19:02:58 -0500
Subject: [PATCH 2/4] fix(evals): remove unused Any import in swebench_score.py
Addresses CodeQL (github-code-quality) finding on the PR.
---
utils/evals/swebench_score.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/utils/evals/swebench_score.py b/utils/evals/swebench_score.py
index edf8ef212..371260443 100644
--- a/utils/evals/swebench_score.py
+++ b/utils/evals/swebench_score.py
@@ -26,7 +26,7 @@
import subprocess
import sys
from pathlib import Path
-from typing import Any, Iterator, Optional
+from typing import Iterator, Optional
DEFAULT_DATASET = "princeton-nlp/SWE-bench_Lite"
DEFAULT_TASK = "swebench_lite"
From 6d5972abe8ac693a4a7fa845c6035301958b1bd7 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Fri, 26 Jun 2026 19:15:11 -0500
Subject: [PATCH 3/4] fix(evals): address PR review (claude bot) findings on
swebench
- swebench_score.py: bound the bare-diff fallback to the diff body so trailing
prose after a patch can't be glued on (would fail git apply -> unresolved,
suppressing resolved-rate). Add _trim_to_diff_body + regression tests.
- benchmark_lib.sh: derive the scoring dataset from the generation YAML's
dataset_path so generation/scoring can't diverge; SWEBENCH_DATASET (if set)
must match or it fails fast. Update docstring + EVALS.md.
CodeQL unused-import (Any) already fixed in 4886287d.
---
benchmarks/benchmark_lib.sh | 29 +++++++++++++---
utils/evals/EVALS.md | 6 ++--
utils/evals/swebench_score.py | 57 ++++++++++++++++++++++++++++---
utils/evals/test_swebench_eval.py | 27 +++++++++++++++
4 files changed, 108 insertions(+), 11 deletions(-)
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 70353e65d..fb9d35f64 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -977,8 +977,10 @@ META
# pipeline (append_lm_eval_summary / collect / validate) is unchanged.
#
# Env knobs:
-# SWEBENCH_DATASET (default princeton-nlp/SWE-bench_Lite)
-# SWEBENCH_TASK_NAME (default swebench_lite)
+# SWEBENCH_TASK_NAME (default swebench_lite) selects utils/evals/<task_name>.yaml
+# SWEBENCH_DATASET optional; must equal the YAML's dataset_path (the
+# scoring dataset is derived from the YAML so generation
+# and scoring never diverge) -- mismatch fails fast
# SWEBENCH_MAX_WORKERS (default 4) harness Docker workers
# SWEBENCH_NAMESPACE pass "" on arm/Mac to build images locally
# SWEBENCH_SKIP_SCORE "true" => generate + stage predictions only, no Docker
@@ -989,10 +991,29 @@ run_swebench_eval() {
local gen_dir
gen_dir=$(mktemp -d /tmp/swebench_gen-XXXXXX)
+ # Keep the scoring dataset in lockstep with the generation YAML: the harness
+ # must score against the same instance set lm-eval generated patches for, or
+ # the instance IDs won't match. Derive it from the task YAML; if
+ # SWEBENCH_DATASET is set it must agree (fail-fast rather than mis-score).
+ local yaml_path="${EVAL_TASKS_DIR:-utils/evals/${task_name}.yaml}"
+ local dataset
+ dataset=$(awk '/^dataset_path:[[:space:]]/{print $2; exit}' "$yaml_path" 2>/dev/null)
+ if [ -z "$dataset" ]; then
+ echo "ERROR: could not read dataset_path from ${yaml_path}" >&2
+ rm -rf "$gen_dir" 2>/dev/null || true
+ return 1
+ fi
+ if [ -n "${SWEBENCH_DATASET:-}" ] && [ "${SWEBENCH_DATASET}" != "$dataset" ]; then
+ echo "ERROR: SWEBENCH_DATASET='${SWEBENCH_DATASET}' disagrees with ${yaml_path} dataset_path='${dataset}'." >&2
+ echo " Generation and scoring must use the same dataset; edit the YAML or unset SWEBENCH_DATASET." >&2
+ rm -rf "$gen_dir" 2>/dev/null || true
+ return 1
+ fi
+
# 1. Generation via lm-eval (reuses endpoint wiring, _patch_lm_eval, etc.).
# run_lm_eval already passes --log_samples, which is what we consume.
local prev_tasks_dir="${EVAL_TASKS_DIR:-}"
- export EVAL_TASKS_DIR="${EVAL_TASKS_DIR:-utils/evals/${task_name}.yaml}"
+ export EVAL_TASKS_DIR="$yaml_path"
local gen_rc=0
run_lm_eval "$@" --results-dir "$gen_dir" || gen_rc=$?
export EVAL_TASKS_DIR="$prev_tasks_dir"
@@ -1031,7 +1052,7 @@ run_swebench_eval() {
--out-dir "$out_dir" \
--model-name "${MODEL_NAME:-$MODEL}" \
--task-name "$task_name" \
- --dataset-name "${SWEBENCH_DATASET:-princeton-nlp/SWE-bench_Lite}" \
+ --dataset-name "$dataset" \
--max-workers "${SWEBENCH_MAX_WORKERS:-4}" \
--lm-eval-version "$lm_eval_version" \
${SWEBENCH_NAMESPACE+--namespace "$SWEBENCH_NAMESPACE"} \
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index 752e13131..a7738defc 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -187,8 +187,10 @@ append_lm_eval_summary
- Scoring: `utils/evals/swebench_score.py` (diff extraction → `predictions.jsonl` →
`python -m swebench.harness.run_evaluation` → resolved-rate → results JSON). Offline
`--report` mode skips Docker for testing.
-- Knobs: `SWEBENCH_DATASET`, `SWEBENCH_TASK_NAME`, `SWEBENCH_MAX_WORKERS`,
- `SWEBENCH_NAMESPACE` (pass `""` on arm/Mac), `SWEBENCH_SKIP_SCORE=true` (generate-only).
+- Knobs: `SWEBENCH_TASK_NAME` (selects the YAML), `SWEBENCH_MAX_WORKERS`,
+ `SWEBENCH_NAMESPACE` (pass `""` on arm/Mac), `SWEBENCH_SKIP_SCORE=true` (generate-only). The
+ scoring dataset is derived from the YAML's `dataset_path` so generation and scoring can't diverge;
+ `SWEBENCH_DATASET`, if set, must match it (mismatch fails fast).
- **Requires Docker + ~120 GB disk on the scoring host.** This is an MVP; the single-shot prompt and
diff extraction still need tuning to reach published resolved-rates, and the `thresholds.json` entry
needs calibration from a baseline run.
diff --git a/utils/evals/swebench_score.py b/utils/evals/swebench_score.py
index 371260443..c1e511ed0 100644
--- a/utils/evals/swebench_score.py
+++ b/utils/evals/swebench_score.py
@@ -39,6 +39,44 @@
)
_DIFF_GIT_RE = re.compile(r"(?:^|\n)(diff --git .*)", re.DOTALL)
+# Line prefixes that belong to a (git) unified-diff body. Anything else marks
+# the end of the patch.
+_DIFF_LINE_PREFIXES = (
+ "diff ", "index ", "--- ", "+++ ", "@@", "+", "-", " ", "\\",
+ "old mode ", "new mode ", "new file mode ", "deleted file mode ",
+ "rename ", "copy ", "similarity ", "dissimilarity ",
+ "Binary files ", "GIT binary patch",
+)
+
+
+def _trim_to_diff_body(text: str) -> str:
+ """Keep only the leading run of diff-shaped lines, dropping trailing prose.
+
+ Models frequently emit a bare patch followed by an explanation ("Notes:",
+ "This fixes #123."). With no terminator that tail gets glued onto the patch
+ and rejected by ``git apply``, scoring the instance unresolved. Blank lines
+ are kept only when the diff resumes after them; a blank line followed by
+ non-diff text ends the patch.
+ """
+ lines = text.splitlines()
+ out: list[str] = []
+ i, n = 0, len(lines)
+ while i < n:
+ if lines[i].startswith(_DIFF_LINE_PREFIXES):
+ out.append(lines[i])
+ i += 1
+ continue
+ if lines[i] == "":
+ j = i
+ while j < n and lines[j] == "":
+ j += 1
+ if j < n and lines[j].startswith(_DIFF_LINE_PREFIXES):
+ out.extend(lines[i:j]) # interior blank line(s); diff resumes
+ i = j
+ continue
+ break # trailing blank(s)+prose, or any other non-diff line
+ return "\n".join(out)
+
def extract_patch(text: str) -> str:
"""Pull a unified diff out of a model generation.
@@ -50,19 +88,28 @@ def extract_patch(text: str) -> str:
"""
if not text:
return ""
+
+ def _finish(body: str) -> str:
+ body = _trim_to_diff_body(body).strip("\n")
+ return body + "\n" if body else ""
+
# 1. Prefer a fenced block that actually looks like a diff.
for match in _FENCED_DIFF_RE.finditer(text):
body = match.group("body")
if "diff --git" in body or body.lstrip().startswith(("--- ", "+++ ")):
- return body.strip("\n") + "\n"
- # 2. Fall back to the first ``diff --git`` to end-of-text.
+ return _finish(body)
+ # 2. Fall back to a bare ``diff --git``, trimmed to the diff body so
+ # trailing prose can't corrupt the patch.
git_match = _DIFF_GIT_RE.search(text)
if git_match:
- return git_match.group(1).strip("\n") + "\n"
- # 3. Last resort: a lone fenced block, or the raw text.
+ trimmed = _finish(git_match.group(1))
+ if trimmed:
+ return trimmed
+ # 3. Last resort: a lone fenced block (fence-bounded), or the raw text.
lone = _FENCED_DIFF_RE.search(text)
if lone:
- return lone.group("body").strip("\n") + "\n"
+ body = lone.group("body").strip("\n")
+ return body + "\n" if body else ""
return text.strip("\n") + "\n" if text.strip() else ""
diff --git a/utils/evals/test_swebench_eval.py b/utils/evals/test_swebench_eval.py
index 8e59975ca..067742759 100644
--- a/utils/evals/test_swebench_eval.py
+++ b/utils/evals/test_swebench_eval.py
@@ -40,6 +40,33 @@ def test_extract_patch_bare_diff_git():
assert "no fence" not in patch
+def test_extract_patch_bare_diff_strips_trailing_prose():
+ # A bare diff followed by an explanation must not glue the prose onto the
+ # patch (git apply would reject it -> instance scored unresolved).
+ text = (
+ "diff --git a/x b/x\n--- a/x\n+++ b/x\n@@ -1 +1 @@\n-old\n+new\n"
+ "\nNotes:\nThis fixes #123.\n"
+ )
+ patch = sbs.extract_patch(text)
+ assert patch.rstrip().endswith("+new")
+ assert "Notes:" not in patch
+ assert "This fixes" not in patch
+
+
+def test_extract_patch_keeps_multi_file_and_interior_context():
+ # Multiple files + a blank context line (represented as " ") stay intact.
+ text = (
+ "```diff\n"
+ "diff --git a/a b/a\n@@ -1,2 +1,2 @@\n context\n-x\n+y\n"
+ "diff --git a/b b/b\n@@ -1 +1 @@\n-p\n+q\n"
+ "```\nthanks!"
+ )
+ patch = sbs.extract_patch(text)
+ assert "diff --git a/a b/a" in patch
+ assert "diff --git a/b b/b" in patch
+ assert "thanks" not in patch
+
+
def test_extract_patch_empty_when_no_diff():
assert sbs.extract_patch("") == ""
# Prose with no diff markers falls back to the raw text (harness will reject).
From ce358e4049bc699fc14fc9780e129dd046c45836 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Fri, 26 Jun 2026 20:42:24 -0500
Subject: [PATCH 4/4] feat(evals): /run-evals comment command to run one eval
on one recipe
Adds a lightweight, eval-only PR-comment trigger (no perf sweep):
/run-evals <eval> <config-key> [conc] [master-config]
e.g. /run-evals swebench_lite dsr1-fp4-b200-sglang 16
- benchmark_lib.sh: EVAL_FRAMEWORK env now wins over run_eval's --framework arg
so an eval-only run can override the recipes' hardcoded lm-eval (default
behavior unchanged when env unset).
- e2e-tests.yml / benchmark-tmpl.yml / benchmark-multinode-tmpl.yml: thread new
eval-framework + eval-task inputs -> EVAL_FRAMEWORK / EVAL_TASKS_DIR env.
- run-evals.yml: new comment workflow (mirrors pr-comment-sweep.yml auth/SHA-pin/
reply), maps <eval> -> framework+task, infers nvidia/amd master config from the
config-key HW token, builds 'test-config ... --evals-only', calls e2e-tests.yml.
- test_run_eval_dispatch.py: unit tests for the env-override dispatch.
NOTE: swebench scoring needs Docker on the eval runner (else SWEBENCH_SKIP_SCORE).
---
.../workflows/benchmark-multinode-tmpl.yml | 12 ++
.github/workflows/benchmark-tmpl.yml | 12 ++
.github/workflows/e2e-tests.yml | 24 +++
.github/workflows/run-evals.yml | 192 ++++++++++++++++++
benchmarks/benchmark_lib.sh | 9 +-
utils/evals/test_run_eval_dispatch.py | 53 +++++
6 files changed, 300 insertions(+), 2 deletions(-)
create mode 100644 .github/workflows/run-evals.yml
create mode 100644 utils/evals/test_run_eval_dispatch.py
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index 3beb246cc..b8600b049 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -91,6 +91,16 @@ on:
type: string
required: false
default: ""
+ eval-framework:
+ description: "Eval framework (lm-eval | swebench). Empty = recipe default."
+ type: string
+ required: false
+ default: ""
+ eval-task:
+ description: "Eval task YAML path. Empty = framework default."
+ type: string
+ required: false
+ default: ""
scenario-type:
description: "Scenario type (fixed-seq-len or agentic-coding)"
type: string
@@ -143,6 +153,8 @@ env:
RUN_EVAL: ${{ inputs.run-eval }}
EVAL_ONLY: ${{ inputs.eval-only }}
EVAL_CONC: ${{ inputs.eval-conc }}
+ EVAL_FRAMEWORK: ${{ inputs.eval-framework }}
+ EVAL_TASKS_DIR: ${{ inputs.eval-task }}
SCENARIO_TYPE: ${{ inputs.scenario-type }}
SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || 'fixed_seq_len/' }}
IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }}
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index a57e89725..15c052207 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -59,6 +59,16 @@ on:
type: boolean
required: false
default: false
+ eval-framework:
+ description: "Eval framework (lm-eval | swebench). Empty = recipe default."
+ type: string
+ required: false
+ default: ""
+ eval-task:
+ description: "Eval task YAML path. Empty = framework default."
+ type: string
+ required: false
+ default: ""
random-range-ratio:
required: false
type: string
@@ -108,6 +118,8 @@ env:
DISAGG: ${{ inputs.disagg }}
RUN_EVAL: ${{ inputs.run-eval }}
EVAL_ONLY: ${{ inputs.eval-only }}
+ EVAL_FRAMEWORK: ${{ inputs.eval-framework }}
+ EVAL_TASKS_DIR: ${{ inputs.eval-task }}
SCENARIO_TYPE: ${{ inputs.scenario-type }}
SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || 'fixed_seq_len/' }}
IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }}
diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml
index 1b83a798a..b5b65e8cf 100644
--- a/.github/workflows/e2e-tests.yml
+++ b/.github/workflows/e2e-tests.yml
@@ -21,6 +21,16 @@ on:
required: false
type: string
default: ""
+ eval-framework:
+ description: "Eval framework (lm-eval | swebench). Overrides the recipe default."
+ required: false
+ type: string
+ default: "lm-eval"
+ eval-task:
+ description: "Eval task YAML path (e.g. utils/evals/gsm8k.yaml). Empty = framework default."
+ required: false
+ type: string
+ default: ""
workflow_call:
inputs:
generate-cli-command:
@@ -40,6 +50,16 @@ on:
required: false
type: string
default: ""
+ eval-framework:
+ description: "Eval framework (lm-eval | swebench). Overrides the recipe default."
+ required: false
+ type: string
+ default: "lm-eval"
+ eval-task:
+ description: "Eval task YAML path (e.g. utils/evals/gsm8k.yaml). Empty = framework default."
+ required: false
+ type: string
+ default: ""
jobs:
get-jobs:
@@ -160,6 +180,8 @@ jobs:
run-eval: true
eval-only: true
eval-conc: ${{ matrix.config['eval-all-concs'] && join(matrix.config.conc, ' ') || matrix.config['eval-conc'] }}
+ eval-framework: ${{ inputs.eval-framework }}
+ eval-task: ${{ inputs.eval-task }}
ref: ${{ inputs.ref }}
test-sweep-agentic:
@@ -294,6 +316,8 @@ jobs:
disagg: ${{ matrix.config.disagg }}
run-eval: true
eval-only: true
+ eval-framework: ${{ inputs.eval-framework }}
+ eval-task: ${{ inputs.eval-task }}
ref: ${{ inputs.ref }}
collect-results:
diff --git a/.github/workflows/run-evals.yml b/.github/workflows/run-evals.yml
new file mode 100644
index 000000000..1d6b5d06e
--- /dev/null
+++ b/.github/workflows/run-evals.yml
@@ -0,0 +1,192 @@
+name: Slash Command Run Evals
+run-name: "/run-evals PR #${{ github.event.issue.number }}"
+
+# Comment-triggered, eval-only run of ONE eval on ONE recipe (no perf sweep).
+# Usage in a PR comment:
+# /run-evals <eval> <config-key> [conc] [master-config]
+# where <eval> is one of: gsm8k | gpqa | swebench_lite (alias: swebench).
+# Example: /run-evals swebench_lite dsr1-fp4-b200-sglang 16
+# Mirrors pr-comment-sweep.yml; differs only in parsing + the eval mapping it
+# forwards to e2e-tests.yml (eval-framework / eval-task).
+
+on:
+ issue_comment:
+ types: [created]
+
+permissions:
+ contents: read
+ issues: write
+ pull-requests: write
+
+jobs:
+ get-jobs:
+ # Only run for PR comments that start with /run-evals
+ if: ${{ github.event.issue.pull_request && startsWith(github.event.comment.body, '/run-evals') }}
+ runs-on: ubuntu-latest
+ outputs:
+ pr-number: ${{ steps.parse.outputs.pr-number }}
+ generator-args: ${{ steps.parse.outputs.generator-args }}
+ eval-framework: ${{ steps.parse.outputs.eval-framework }}
+ eval-task: ${{ steps.parse.outputs.eval-task }}
+ author-can-bypass: ${{ steps.auth.outputs.can-bypass }}
+ # Immutable ref (commit SHA) to prevent TOCTOU on refs/pull/<number>/head
+ ref: ${{ steps.ref_comment.outputs.ref }}
+ steps:
+ - name: Parse PR comment (/run-evals [conc] [master])
+ id: parse
+ if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request && startsWith(github.event.comment.body, '/run-evals') }}
+ shell: bash
+ env:
+ BODY: ${{ github.event.comment.body }}
+ PR_NUMBER: ${{ github.event.issue.number }}
+ run: |
+ set -euo pipefail
+ # Require /run-evals at the start of a line.
+ cmd_line=$(printf "%s" "$BODY" | awk '/^\/run-evals/{print; exit}')
+ if [[ -z "$cmd_line" ]]; then
+ echo "No /run-evals command found at comment start" >&2
+ exit 1
+ fi
+ # Positional args after the command.
+ read -ra parts <<< "${cmd_line#/run-evals}"
+ eval_name="${parts[0]:-}"
+ config_key="${parts[1]:-}"
+ conc="${parts[2]:-}"
+ master_override="${parts[3]:-}"
+ if [[ -z "$eval_name" || -z "$config_key" ]]; then
+ echo "usage: /run-evals [conc] [master-config]" >&2
+ echo "valid evals: gsm8k | gpqa | swebench_lite" >&2
+ exit 1
+ fi
+
+ # Map <eval> -> (framework, task YAML).
+ case "$eval_name" in
+ gsm8k) framework="lm-eval"; task="utils/evals/gsm8k.yaml" ;;
+ gpqa|gpqa_diamond) framework="lm-eval"; task="utils/evals/gpqa_diamond.yaml" ;;
+ swebench|swebench_lite) framework="swebench"; task="utils/evals/swebench_lite.yaml" ;;
+ *)
+ echo "unknown eval '$eval_name' (valid: gsm8k, gpqa, swebench_lite)" >&2
+ exit 1
+ ;;
+ esac
+
+ if [[ -n "$conc" && ! "$conc" =~ ^[1-9][0-9]*$ ]]; then
+ echo "conc must be a positive integer, got '$conc'" >&2
+ exit 1
+ fi
+
+ # Pick the platform master config from the config-key's hardware token,
+ # unless an explicit 4th arg overrides it.
+ if [[ -n "$master_override" ]]; then
+ master="$master_override"
+ elif [[ "$config_key" =~ (b200|b300|h100|h200|gb200|gb300) ]]; then
+ master=".github/configs/nvidia-master.yaml"
+ elif [[ "$config_key" =~ (mi300x|mi325x|mi355x) ]]; then
+ master=".github/configs/amd-master.yaml"
+ else
+ echo "cannot infer platform from config-key '$config_key'; pass the master config path as a 4th arg" >&2
+ exit 1
+ fi
+
+ gen="test-config --config-files ${master} --config-keys ${config_key} --evals-only"
+ if [[ -n "$conc" ]]; then
+ gen="${gen} --conc ${conc}"
+ fi
+
+ {
+ echo "generator-args=${gen}"
+ echo "eval-framework=${framework}"
+ echo "eval-task=${task}"
+ echo "pr-number=${PR_NUMBER}"
+ } >> "$GITHUB_OUTPUT"
+
+ - name: Check author permissions
+ id: auth
+ if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request }}
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ with:
+ script: |
+ const owner = context.repo.owner;
+ const repo = context.repo.repo;
+ const username = context.payload.comment?.user?.login;
+ let permission = 'none';
+ try {
+ const res = await github.rest.repos.getCollaboratorPermissionLevel({ owner, repo, username });
+ permission = res.data?.permission || 'none';
+ } catch (e) {
+ permission = 'none';
+ }
+ const canBypass = ['admin','maintain','write'].includes(permission);
+ core.info(`Author ${username} permission: ${permission}; bypass=${canBypass}`);
+ core.setOutput('can-bypass', canBypass ? 'true' : 'false');
+
+ # ---- PR SHA pinning ----
+ - name: Resolve immutable PR ref (pin to head SHA)
+ id: ref_comment
+ if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request && startsWith(github.event.comment.body, '/run-evals') }}
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ with:
+ script: |
+ const owner = context.repo.owner;
+ const repo = context.repo.repo;
+ const pr = context.issue.number;
+ const res = await github.rest.pulls.get({ owner, repo, pull_number: pr });
+ const sha = res.data.head.sha;
+ core.info(`Resolved PR #${pr} head SHA: ${sha}`);
+ core.setOutput('ref', sha);
+
+ - name: Reply with run link
+ if: ${{ github.event_name == 'issue_comment' && startsWith(github.event.comment.body, '/run-evals') && github.repository_owner == 'SemiAnalysisAI' }}
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ continue-on-error: true
+ env:
+ RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+ AUTHOR: ${{ github.event.comment.user.login }}
+ GEN_CMD: ${{ steps.parse.outputs.generator-args }}
+ EVAL_FRAMEWORK: ${{ steps.parse.outputs.eval-framework }}
+ CAN_BYPASS: ${{ steps.auth.outputs.can-bypass }}
+ PINNED_REF: ${{ steps.ref_comment.outputs.ref }}
+ with:
+ github-token: ${{ github.token }}
+ script: |
+ const owner = context.repo.owner;
+ const repo = context.repo.repo;
+ const issue_number = context.issue.number;
+ const runUrl = process.env.RUN_URL;
+ const author = process.env.AUTHOR;
+ const genCmd = process.env.GEN_CMD || '';
+ const framework = process.env.EVAL_FRAMEWORK || '';
+ const canBypass = (process.env.CAN_BYPASS || '').toLowerCase() === 'true';
+ const pinned = process.env.PINNED_REF || '';
+ const shortSha = pinned ? pinned.slice(0, 7) : '';
+ const approvalMsg = canBypass ? 'Approval: not required (trusted collaborator).' : "Approval: required in environment 'Outside Collaborator E2E Test'.";
+ const body = `@${author} Kicking off an eval-only run (framework: \`${framework}\`).\n\nRun: ${runUrl}\nCommand: \`${genCmd}\`\nPinned ref: \`${shortSha}\`\n${approvalMsg}`;
+ await github.rest.issues.createComment({ owner, repo, issue_number, body });
+
+ approval:
+ needs: get-jobs
+ if: ${{ github.event_name == 'issue_comment' && needs.get-jobs.outputs.pr-number != '' && needs.get-jobs.outputs.generator-args != '' && needs.get-jobs.outputs.author-can-bypass != 'true' }}
+ runs-on: ubuntu-latest
+ name: approval
+ environment: Outside Collaborator E2E Test
+ steps:
+ - run: echo "approved"
+
+ validate:
+ needs: [get-jobs, approval]
+ # always() is required to evaluate this condition when 'approval' is skipped (trusted author)
+ if: ${{ always() && needs.get-jobs.result == 'success' && needs.get-jobs.outputs.pr-number != '' && needs.get-jobs.outputs.generator-args != '' && (needs.get-jobs.outputs.author-can-bypass == 'true' || needs.approval.result == 'success') }}
+ # Concurrency at job level so non-/run-evals comments don't cancel active runs
+ concurrency:
+ group: "run-evals-PR#${{ needs.get-jobs.outputs.pr-number }}"
+ cancel-in-progress: true
+ uses: ./.github/workflows/e2e-tests.yml
+ name: validate
+ secrets: inherit
+ with:
+ generate-cli-command: ${{ needs.get-jobs.outputs.generator-args }}
+ eval-framework: ${{ needs.get-jobs.outputs.eval-framework }}
+ eval-task: ${{ needs.get-jobs.outputs.eval-task }}
+ test-name: PR #${{ needs.get-jobs.outputs.pr-number }} /run-evals
+ # Use pinned SHA to prevent TOCTOU on refs/pull/<number>/head
+ ref: ${{ needs.get-jobs.outputs.ref }}
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index fb9d35f64..060b0e926 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -1069,15 +1069,20 @@ run_swebench_eval() {
# ------------------------------
run_eval() {
- local framework="${EVAL_FRAMEWORK:-lm-eval}"
+ # EVAL_FRAMEWORK (env) wins over the --framework arg so an eval-only run
+ # (e.g. the /run-evals command) can override the recipe scripts' hardcoded
+ # `--framework lm-eval`. With the env unset, the CLI arg (else lm-eval) is
+ # used exactly as before.
+ local cli_framework=""
local forwarded=()
while [[ $# -gt 0 ]]; do
case "$1" in
- --framework) framework="$2"; shift 2 ;;
+ --framework) cli_framework="$2"; shift 2 ;;
*) forwarded+=("$1"); shift ;;
esac
done
+ local framework="${EVAL_FRAMEWORK:-${cli_framework:-lm-eval}}"
# Compute EVAL_MAX_MODEL_LEN if not already set by the calling script
if [ -z "${EVAL_MAX_MODEL_LEN:-}" ]; then
diff --git a/utils/evals/test_run_eval_dispatch.py b/utils/evals/test_run_eval_dispatch.py
new file mode 100644
index 000000000..44a5d785a
--- /dev/null
+++ b/utils/evals/test_run_eval_dispatch.py
@@ -0,0 +1,53 @@
+"""run_eval framework dispatch: EVAL_FRAMEWORK (env) overrides the --framework arg.
+
+This is what lets `/run-evals swebench_lite ...` run swebench even though every
+recipe script hardcodes `run_eval --framework lm-eval`. With the env unset, the
+CLI arg (else lm-eval) is used as before.
+"""
+
+import os
+import subprocess
+from pathlib import Path
+
+BENCHMARK_LIB = Path(__file__).resolve().parents[2] / "benchmarks" / "benchmark_lib.sh"
+
+# Stub the framework runners so dispatch is observable without a server/Docker,
+# and pin EVAL_MAX_MODEL_LEN so run_eval skips context computation.
+_SCRIPT = r'''
+source "$BENCHMARK_LIB"
+run_lm_eval() { echo "DISPATCH=lm-eval"; }
+run_swebench_eval() { echo "DISPATCH=swebench"; }
+export EVAL_MAX_MODEL_LEN=16384
+unset EVAL_CONCURRENT_REQUESTS
+run_eval --framework "$CLI_FW" --port 8888
+'''
+
+
+def _dispatch(cli_fw: str, env_fw: str | None) -> str:
+ env = {**os.environ, "BENCHMARK_LIB": str(BENCHMARK_LIB), "CLI_FW": cli_fw}
+ env.pop("EVAL_FRAMEWORK", None)
+ if env_fw is not None:
+ env["EVAL_FRAMEWORK"] = env_fw
+ res = subprocess.run(
+ ["bash", "-c", _SCRIPT], env=env, text=True, capture_output=True, check=True
+ )
+ return res.stdout
+
+
+def test_env_framework_overrides_cli_arg():
+ # recipe passes --framework lm-eval, but EVAL_FRAMEWORK=swebench wins.
+ assert "DISPATCH=swebench" in _dispatch("lm-eval", "swebench")
+
+
+def test_cli_arg_used_when_env_unset():
+ assert "DISPATCH=lm-eval" in _dispatch("lm-eval", None)
+
+
+def test_swebench_via_cli_arg_when_env_unset():
+ assert "DISPATCH=swebench" in _dispatch("swebench", None)
+
+
+def test_empty_env_falls_back_to_cli_arg():
+ # An empty EVAL_FRAMEWORK (how the template passes it when unset) must not
+ # force anything -- the CLI arg still wins.
+ assert "DISPATCH=lm-eval" in _dispatch("lm-eval", "")