From 27d164103995aea671631cfc0011479afd86f8b8 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Fri, 26 Jun 2026 18:53:23 -0500 Subject: [PATCH 1/4] feat(evals): add SWE-bench Lite accuracy eval lm-eval cannot score SWE-bench (no repo-level Docker test executor), so this reuses lm-eval for patch *generation* and adds a scoring step that runs the official swebench harness, emitting an lm-eval-shaped results JSON so the existing collect/validate pipeline works unchanged. - swebench_lite.yaml: lm-eval generate_until task over SWE-bench Lite - swebench_score.py: diff extraction -> predictions.jsonl -> swebench harness -> resolved-rate -> lm-eval-shaped results; offline --report/--predictions-only - benchmark_lib.sh: run_swebench_eval + --framework swebench dispatch - collect_eval_results.py: recognize 'resolved' filter as the primary score - thresholds.json: placeholder swebench_lite entry (needs calibration) - EVALS.md: document the new framework + task - test_swebench_eval.py: unit + integration tests --- benchmarks/benchmark_lib.sh | 78 +++++++ utils/collect_eval_results.py | 4 +- utils/evals/EVALS.md | 25 +++ utils/evals/swebench_lite.yaml | 56 +++++ utils/evals/swebench_score.py | 340 ++++++++++++++++++++++++++++++ utils/evals/test_swebench_eval.py | 182 ++++++++++++++++ utils/evals/thresholds.json | 3 +- 7 files changed, 686 insertions(+), 2 deletions(-) create mode 100644 utils/evals/swebench_lite.yaml create mode 100644 utils/evals/swebench_score.py create mode 100644 utils/evals/test_swebench_eval.py diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 95e063a3d..70353e65d 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -966,6 +966,83 @@ META echo "Moved eval artifacts to: $(pwd)" } +# ------------------------------ +# SWE-bench eval helpers +# ------------------------------ + +# Run the SWE-bench Lite eval: generate patches with lm-eval, then score them +# with the 
official swebench Docker harness. lm-eval cannot score SWE-bench +# itself (no repo-level test executor), so we reuse it only for generation and +# emit an lm-eval-shaped results JSON from swebench_score.py so the rest of the +# pipeline (append_lm_eval_summary / collect / validate) is unchanged. +# +# Env knobs: +# SWEBENCH_DATASET (default princeton-nlp/SWE-bench_Lite) +# SWEBENCH_TASK_NAME (default swebench_lite) +# SWEBENCH_MAX_WORKERS (default 4) harness Docker workers +# SWEBENCH_NAMESPACE pass "" on arm/Mac to build images locally +# SWEBENCH_SKIP_SCORE "true" => generate + stage predictions only, no Docker +# (for runners without Docker; score elsewhere) +run_swebench_eval() { + local out_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}" + local task_name="${SWEBENCH_TASK_NAME:-swebench_lite}" + local gen_dir + gen_dir=$(mktemp -d /tmp/swebench_gen-XXXXXX) + + # 1. Generation via lm-eval (reuses endpoint wiring, _patch_lm_eval, etc.). + # run_lm_eval already passes --log_samples, which is what we consume. + local prev_tasks_dir="${EVAL_TASKS_DIR:-}" + export EVAL_TASKS_DIR="${EVAL_TASKS_DIR:-utils/evals/${task_name}.yaml}" + local gen_rc=0 + run_lm_eval "$@" --results-dir "$gen_dir" || gen_rc=$? + export EVAL_TASKS_DIR="$prev_tasks_dir" + if [ "$gen_rc" -ne 0 ]; then + echo "ERROR: swebench generation (lm-eval) failed with $gen_rc" >&2 + rm -rf "$gen_dir" 2>/dev/null || true + return "$gen_rc" + fi + + # Preserve generations as artifacts alongside the scored results. + mkdir -p "$out_dir" + find "$gen_dir" -name 'samples_*.jsonl' -exec cp -f {} "$out_dir"/ \; 2>/dev/null || true + export EVAL_RESULT_DIR="$out_dir" + + local lm_eval_version + lm_eval_version=$(python3 -c 'import lm_eval; print(lm_eval.__version__)' 2>/dev/null || echo unknown) + + if [ "${SWEBENCH_SKIP_SCORE:-false}" = "true" ]; then + # Generation-only mode: emit predictions, defer Docker scoring elsewhere. 
+ # TODO(alec): wire the separate scoring job (Modal / sb-cli / CPU runner). + local skip_rc=0 + python3 utils/evals/swebench_score.py \ + --samples-dir "$gen_dir" --out-dir "$out_dir" \ + --model-name "${MODEL_NAME:-$MODEL}" --task-name "$task_name" \ + --predictions-only || skip_rc=$? + echo "SWEBENCH_SKIP_SCORE=true: staged predictions only (no resolved-rate)." >&2 + rm -rf "$gen_dir" 2>/dev/null || true + return "$skip_rc" + fi + + # 2. Score with the official swebench harness (requires Docker) and emit the + # lm-eval-shaped results JSON into EVAL_RESULT_DIR. + local score_rc=0 + python3 utils/evals/swebench_score.py \ + --samples-dir "$gen_dir" \ + --out-dir "$out_dir" \ + --model-name "${MODEL_NAME:-$MODEL}" \ + --task-name "$task_name" \ + --dataset-name "${SWEBENCH_DATASET:-princeton-nlp/SWE-bench_Lite}" \ + --max-workers "${SWEBENCH_MAX_WORKERS:-4}" \ + --lm-eval-version "$lm_eval_version" \ + ${SWEBENCH_NAMESPACE+--namespace "$SWEBENCH_NAMESPACE"} \ + || score_rc=$? + rm -rf "$gen_dir" 2>/dev/null || true + if [ "$score_rc" -ne 0 ]; then + echo "ERROR: swebench scoring failed with $score_rc" >&2 + return "$score_rc" + fi +} + # ------------------------------ # Unified eval entrypoint # ------------------------------ @@ -1052,6 +1129,7 @@ run_eval() { local eval_rc=0 case "$framework" in lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" || eval_rc=$? ;; + swebench) run_swebench_eval "${forwarded[@]}" || eval_rc=$? 
;; *) echo "Unknown framework '${framework}'"; eval_rc=1 ;; esac diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 194fa4acb..f98a6f7c4 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -141,7 +141,9 @@ def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]: # Extract metrics for each filter for f in filter_list: fname = f['name'] - if 'strict' in fname: + # 'resolved' is SWE-bench's resolved-rate (swebench_score.py); + # treat it as the primary/strict score so it populates `score`. + if 'strict' in fname or 'resolved' in fname: strict_val, strict_se = get_val_se(fname) elif 'flex' in fname or 'extract' in fname: flex_val, flex_se = get_val_se(fname) diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index 7ff878dce..752e13131 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -169,7 +169,32 @@ The codebase patches lm-eval compatibility via `_patch_lm_eval`: 1. Reasoning token handling: extracts `reasoning_content` when `message.content` is empty. 2. TRT compatibility: avoids injecting `{"type": "text"}` for non-HF tokenizers. +### SWE-bench Lite (`--framework swebench`) + +SWE-bench is **not** a `generate_until` QA task — it requires applying the model's +patch to a repo and running tests in Docker, which lm-eval cannot do. So it runs +through a dedicated framework that reuses lm-eval for *generation* only, then scores +with the official `swebench` harness and emits an lm-eval-shaped results JSON +(metric `exact_match,resolved` = resolved-rate) so collect/validate work unchanged. + +```bash +run_eval --framework swebench --port "$PORT" # generation (lm-eval) -> scoring (swebench) +append_lm_eval_summary +``` + +- Task: `utils/evals/swebench_lite.yaml` (generation) — SWE-bench Lite, the ~300-instance curated + quick-eval subset (no difficulty filter needed; Lite is already the lightweight set). 
+- Scoring: `utils/evals/swebench_score.py` (diff extraction → `predictions.jsonl` → + `python -m swebench.harness.run_evaluation` → resolved-rate → results JSON). Offline + `--report` mode skips Docker for testing. +- Knobs: `SWEBENCH_DATASET`, `SWEBENCH_TASK_NAME`, `SWEBENCH_MAX_WORKERS`, + `SWEBENCH_NAMESPACE` (pass `""` on arm/Mac), `SWEBENCH_SKIP_SCORE=true` (generate-only). +- **Requires Docker + ~120 GB disk on the scoring host.** This is an MVP; the single-shot prompt and + diff extraction still need tuning to reach published resolved-rates, and the `thresholds.json` entry + needs calibration from a baseline run. + ## Task files The following files are task definitions from lm-eval; more information on changes lives within the files: - `utils/evals/gsm8k.yaml` - `utils/evals/gpqa_diamond.yaml` +- `utils/evals/swebench_lite.yaml` (generation only; scored by `swebench_score.py`) diff --git a/utils/evals/swebench_lite.yaml b/utils/evals/swebench_lite.yaml new file mode 100644 index 000000000..4633af462 --- /dev/null +++ b/utils/evals/swebench_lite.yaml @@ -0,0 +1,56 @@ +# SWE-bench Lite -- GENERATION ONLY. +# +# Lite is the ~300-instance curated subset for quick evals (no difficulty labels; +# it's already the lightweight set, so no filtering is needed -- unlike Verified, +# which carries a `difficulty` field). +# +# lm-eval is used purely to drive the served OpenAI-compatible endpoint and dump +# one candidate patch per instance via --log_samples. The metric below is a +# PLACEHOLDER that lm-eval computes but we ignore: the real resolved-rate comes +# from utils/evals/swebench_score.py running the official `swebench` harness, +# which then emits an lm-eval-shaped results JSON for collect/validate. +# +# Run it through the dedicated framework, not bare lm-eval: +# run_eval --framework swebench --port "$PORT" +# which wires generation -> scoring. Bare `--tasks swebench_lite.yaml` would +# produce only the meaningless placeholder metric. 
+task: swebench_lite +dataset_path: princeton-nlp/SWE-bench_Lite # also mirrored at SWE-bench/SWE-bench_Lite +output_type: generate_until +test_split: test + +doc_to_text: | + You are an expert software engineer fixing a real GitHub issue in the + repository `{{repo}}` at commit {{base_commit}}. + + + {{problem_statement}} + + + Respond with ONLY a unified diff (a git patch) that resolves the issue, using + real repository file paths. Do not include explanations. Wrap the patch in a + single fenced block exactly like: + + ```diff + diff --git a/path/to/file.py b/path/to/file.py + --- a/path/to/file.py + +++ b/path/to/file.py + @@ ... @@ + ``` +# The gold patch is the nominal target. lm-eval's exact_match against it is +# meaningless for patches (overwritten by the harness score); it only exists so +# generate_until has a target + a metric and does not error. +doc_to_target: "{{patch}}" + +generation_kwargs: + until: [] + do_sample: false + temperature: 0.0 + +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + +metadata: + version: 0.1 diff --git a/utils/evals/swebench_score.py b/utils/evals/swebench_score.py new file mode 100644 index 000000000..edf8ef212 --- /dev/null +++ b/utils/evals/swebench_score.py @@ -0,0 +1,340 @@ +#!/usr/bin/env python3 +"""Score SWE-bench patches generated by lm-eval and emit an lm-eval-shaped result. + +Pipeline: + + 1. Read lm-eval ``--log_samples`` output (samples_*.jsonl): one candidate per + SWE-bench instance. + 2. Extract a unified diff from each model generation. + 3. Write a ``predictions.jsonl`` in the format the official ``swebench`` harness + expects: ``{instance_id, model_name_or_path, model_patch}``. + 4. Run ``python -m swebench.harness.run_evaluation`` (Docker) to get the + resolved-rate -- unless ``--no-run``/``--report`` is given (offline/testing). + 5. 
Emit a results JSON shaped like an lm-eval result so the existing + ``collect_eval_results.py`` / ``validate_scores.py`` ingest it unchanged. + The metric is published as ``exact_match,resolved`` = resolved-rate. + +The harness needs Docker + lots of disk and is NOT runnable on this dev Mac, so +the Docker step is isolated behind ``--no-run`` for local testing. TODO(alec): +exercise the real ``--run`` path on a runner. +""" + +import argparse +import json +import math +import re +import subprocess +import sys +from pathlib import Path +from typing import Any, Iterator, Optional + +DEFAULT_DATASET = "princeton-nlp/SWE-bench_Lite" +DEFAULT_TASK = "swebench_lite" + +# A unified diff, optionally inside a ```diff / ```patch fence. We try fenced +# first (what the prompt asks for), then a bare ``diff --git`` slice. +_FENCED_DIFF_RE = re.compile( + r"```(?:diff|patch)?\s*\n(?P<body>.*?)```", + re.DOTALL | re.IGNORECASE, +) +_DIFF_GIT_RE = re.compile(r"(?:^|\n)(diff --git .*)", re.DOTALL) + + +def extract_patch(text: str) -> str: + """Pull a unified diff out of a model generation. + + Best-effort and deliberately conservative -- a wrong extraction just means + that instance is unresolved, never a crash. Diff-extraction quality is a + primary tuning lever (TODO(alec)): bad fences here directly suppress the + resolved-rate. + """ + if not text: + return "" + # 1. Prefer a fenced block that actually looks like a diff. + for match in _FENCED_DIFF_RE.finditer(text): + body = match.group("body") + if "diff --git" in body or body.lstrip().startswith(("--- ", "+++ ")): + return body.strip("\n") + "\n" + # 2. Fall back to the first ``diff --git`` to end-of-text. + git_match = _DIFF_GIT_RE.search(text) + if git_match: + return git_match.group(1).strip("\n") + "\n" + # 3. Last resort: a lone fenced block, or the raw text.
+ lone = _FENCED_DIFF_RE.search(text) + if lone: + return lone.group("body").strip("\n") + "\n" + return text.strip("\n") + "\n" if text.strip() else "" + + +def _response_text(record: dict) -> str: + """Extract the model's text from one lm-eval sample record. + + lm-eval's sample schema has drifted across versions; be tolerant. + TODO(alec): confirm against the pinned harness's real samples_*.jsonl. + """ + for key in ("filtered_resps", "resps"): + val = record.get(key) + while isinstance(val, (list, tuple)) and val: + val = val[0] + if isinstance(val, str) and val.strip(): + return val + return "" + + +def _instance_id(record: dict) -> Optional[str]: + doc = record.get("doc") + if isinstance(doc, dict): + for key in ("instance_id", "instance", "id"): + val = doc.get(key) + if isinstance(val, str) and val: + return val + # Some versions hoist doc fields to the top level. + val = record.get("instance_id") + return val if isinstance(val, str) and val else None + + +def iter_samples(samples_dir: Path) -> Iterator[dict]: + """Yield JSON records from every samples_*.jsonl under ``samples_dir``.""" + files = sorted(samples_dir.rglob("samples_*.jsonl")) + if not files: + raise FileNotFoundError( + f"no samples_*.jsonl found under {samples_dir} -- did lm-eval run " + "with --log_samples?" + ) + for path in files: + with path.open() as fh: + for line in fh: + line = line.strip() + if line: + yield json.loads(line) + + +def build_predictions(samples_dir: Path, model_name: str) -> list[dict]: + """Turn lm-eval samples into swebench prediction rows (dedup by instance).""" + by_instance: dict[str, dict] = {} + skipped = 0 + for record in iter_samples(samples_dir): + instance_id = _instance_id(record) + if not instance_id: + skipped += 1 + continue + patch = extract_patch(_response_text(record)) + # Last write wins; SWE-bench is single-attempt so there should be one + # record per instance anyway. 
+ by_instance[instance_id] = { + "instance_id": instance_id, + "model_name_or_path": model_name, + "model_patch": patch, + } + if skipped: + print(f"WARN: skipped {skipped} sample(s) with no instance_id", file=sys.stderr) + if not by_instance: + raise ValueError("no usable predictions extracted from samples") + return list(by_instance.values()) + + +def write_predictions(predictions: list[dict], out_path: Path) -> None: + with out_path.open("w") as fh: + for row in predictions: + fh.write(json.dumps(row) + "\n") + + +def run_harness( + predictions_path: Path, + dataset_name: str, + run_id: str, + work_dir: Path, + max_workers: int, + namespace: Optional[str], +) -> None: + """Invoke the official swebench Docker harness (requires Docker).""" + cmd = [ + sys.executable, "-m", "swebench.harness.run_evaluation", + "--dataset_name", dataset_name, + "--predictions_path", str(predictions_path), + "--run_id", run_id, + "--max_workers", str(max_workers), + ] + if namespace is not None: + # On arm/Mac (and to force local image builds) pass --namespace ''. + cmd += ["--namespace", namespace] + print(f"[swebench] running: {' '.join(cmd)}", flush=True) + subprocess.run(cmd, cwd=str(work_dir), check=True) + + +def find_report(work_dir: Path, model_name: str, run_id: str) -> Path: + """Locate the harness report JSON, tolerant to known layout variants.""" + sanitized = model_name.replace("/", "__") + candidates = [ + work_dir / f"{sanitized}.{run_id}.json", # classic: <model>.<run_id>.json + work_dir / f"{model_name}.{run_id}.json", + work_dir / "evaluation_results" / "results.json", # newer layout + ] + for path in candidates: + if path.exists(): + return path + # Broad fallback: any *.json mentioning resolved/total at the top level.
+ for path in sorted(work_dir.rglob("*.json")): + try: + data = json.loads(path.read_text()) + except (json.JSONDecodeError, OSError): + continue + if isinstance(data, dict) and ( + "resolved_instances" in data or "resolved_ids" in data + ): + return path + raise FileNotFoundError( + f"could not locate a swebench report under {work_dir} " + f"(looked for {[str(c) for c in candidates]})" + ) + + +def parse_resolved(report: dict) -> tuple[int, int]: + """Return (resolved, total) from a harness report, tolerant to key variants. + + Denominator is the full instance count (leaderboard convention: + resolved / total), not just completed instances. + """ + resolved: Optional[int] = None + for key in ("resolved_instances", "resolved", "num_resolved"): + if isinstance(report.get(key), int): + resolved = report[key] + break + if resolved is None and isinstance(report.get("resolved_ids"), list): + resolved = len(report["resolved_ids"]) + + total: Optional[int] = None + for key in ("total_instances", "completed_instances", "submitted_instances"): + val = report.get(key) + if isinstance(val, int) and val > 0: + total = val + break + if total is None: + for key in ("completed_ids", "submitted_ids"): + if isinstance(report.get(key), list) and report[key]: + total = len(report[key]) + break + + if resolved is None or total is None or total <= 0: + raise ValueError( + f"could not parse resolved/total from report keys {sorted(report)}" + ) + return resolved, total + + +def build_results_json( + task: str, + resolved: int, + total: int, + model_name: str, + lm_eval_version: str, + report: Optional[dict], +) -> dict: + """Shape the resolved-rate as an lm-eval result. + + Published as ``exact_match,resolved`` so validate_scores (prefix + ``exact_match,``) gates it and collect_eval_results surfaces it as ``score``. 
+ """ + rate = resolved / total + stderr = math.sqrt(rate * (1.0 - rate) / total) if total else 0.0 + return { + "lm_eval_version": lm_eval_version, + "model_name": model_name, + "results": { + task: { + "alias": task, + "exact_match,resolved": rate, + "exact_match_stderr,resolved": stderr, + } + }, + "configs": { + task: { + "metric_list": [{"metric": "exact_match"}], + "filter_list": [{"name": "resolved"}], + } + }, + "n-samples": {task: {"effective": total, "original": total}}, + # Debugging passthrough; ignored by collectors (no lm_eval_version here). + "swebench": { + "resolved": resolved, + "total": total, + "resolved_rate": rate, + "report": report, + }, + } + + +def main(argv: Optional[list[str]] = None) -> int: + parser = argparse.ArgumentParser(description="Score SWE-bench patches from lm-eval samples") + parser.add_argument("--samples-dir", required=True, help="dir containing lm-eval samples_*.jsonl") + parser.add_argument("--out-dir", required=True, help="dir to write predictions + results JSON") + parser.add_argument("--model-name", required=True, help="served model name (model_name_or_path)") + parser.add_argument("--dataset-name", default=DEFAULT_DATASET) + parser.add_argument("--task-name", default=DEFAULT_TASK) + parser.add_argument("--run-id", default=None, help="harness run id (default: task name)") + parser.add_argument("--max-workers", type=int, default=4) + parser.add_argument( + "--namespace", default=None, + help="swebench --namespace value (pass '' on arm/Mac to build images locally)", + ) + parser.add_argument("--lm-eval-version", default="unknown") + parser.add_argument( + "--predictions-only", action="store_true", + help="write predictions.jsonl and stop (no scoring; score elsewhere)", + ) + parser.add_argument( + "--no-run", action="store_true", + help="skip the Docker harness; requires --report (offline/testing)", + ) + parser.add_argument( + "--report", default=None, + help="path to a pre-computed harness report JSON (implies 
--no-run)", + ) + args = parser.parse_args(argv) + + samples_dir = Path(args.samples_dir) + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + run_id = args.run_id or args.task_name + + # 1-3. samples -> predictions.jsonl + predictions = build_predictions(samples_dir, args.model_name) + predictions_path = out_dir / "predictions.jsonl" + write_predictions(predictions, predictions_path) + print(f"[swebench] wrote {len(predictions)} predictions -> {predictions_path}") + + if args.predictions_only: + print("[swebench] predictions-only: skipping scoring (score elsewhere)") + return 0 + + # 4. score (Docker) or load an existing report + if args.report: + report = json.loads(Path(args.report).read_text()) + elif args.no_run: + print("ERROR: --no-run requires --report", file=sys.stderr) + return 1 + else: + run_harness( + predictions_path, args.dataset_name, run_id, + out_dir, args.max_workers, args.namespace, + ) + report = json.loads(find_report(out_dir, args.model_name, run_id).read_text()) + + resolved, total = parse_resolved(report) + + # 5. emit lm-eval-shaped results + results = build_results_json( + args.task_name, resolved, total, args.model_name, + args.lm_eval_version, report, + ) + results_path = out_dir / f"results_{args.task_name}.json" + results_path.write_text(json.dumps(results, indent=2)) + print( + f"[swebench] {args.task_name}: resolved {resolved}/{total} " + f"= {resolved / total:.4f} -> {results_path}" + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/utils/evals/test_swebench_eval.py b/utils/evals/test_swebench_eval.py new file mode 100644 index 000000000..8e59975ca --- /dev/null +++ b/utils/evals/test_swebench_eval.py @@ -0,0 +1,182 @@ +"""Tests for the SWE-bench Lite eval MVP (generation -> scoring -> lm-eval shape). + +Pure-stdlib paths (extract_patch, predictions, report parsing, results shape) +run on any interpreter. 
The dataset filter and the collect/validate integration +guard on optional deps / interpreter version so the file imports cleanly even on +the macOS system python 3.9 used for local spot-checks. +""" + +import json +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent)) # utils/evals +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) # utils + +import swebench_score as sbs + + +# --- diff extraction ------------------------------------------------------- + +def test_extract_patch_from_diff_fence(): + text = ( + "Here is the fix:\n\n```diff\n" + "diff --git a/f.py b/f.py\n--- a/f.py\n+++ b/f.py\n" + "@@ -1 +1 @@\n-old\n+new\n```\nDone." + ) + patch = sbs.extract_patch(text) + assert patch.startswith("diff --git a/f.py b/f.py") + assert patch.endswith("\n") + assert "Here is the fix" not in patch + assert "Done." not in patch + + +def test_extract_patch_bare_diff_git(): + text = "no fence\ndiff --git a/x b/x\n@@ @@\n-a\n+b\n" + patch = sbs.extract_patch(text) + assert patch.startswith("diff --git a/x b/x") + assert "no fence" not in patch + + +def test_extract_patch_empty_when_no_diff(): + assert sbs.extract_patch("") == "" + # Prose with no diff markers falls back to the raw text (harness will reject). 
+ assert sbs.extract_patch("just words").strip() == "just words" + + +# --- samples -> predictions ------------------------------------------------ + +def _write_samples(dirpath: Path, records: list[dict]) -> None: + with (dirpath / "samples_swebench_lite_2026.jsonl").open("w") as fh: + for rec in records: + fh.write(json.dumps(rec) + "\n") + + +def test_build_predictions_extracts_instance_and_patch(tmp_path): + _write_samples(tmp_path, [ + { + "doc": {"instance_id": "repo__proj-1"}, + "filtered_resps": ["```diff\ndiff --git a/a b/a\n+x\n```"], + }, + { + "doc": {"instance_id": "repo__proj-2"}, + "resps": [["diff --git a/b b/b\n+y\n"]], + }, + ]) + preds = sbs.build_predictions(tmp_path, "my-model") + by_id = {p["instance_id"]: p for p in preds} + assert set(by_id) == {"repo__proj-1", "repo__proj-2"} + assert by_id["repo__proj-1"]["model_name_or_path"] == "my-model" + assert by_id["repo__proj-1"]["model_patch"].startswith("diff --git a/a b/a") + assert by_id["repo__proj-2"]["model_patch"].startswith("diff --git a/b b/b") + + +def test_build_predictions_raises_without_samples(tmp_path): + with pytest.raises(FileNotFoundError): + sbs.build_predictions(tmp_path, "m") + + +# --- report parsing -------------------------------------------------------- + +def test_parse_resolved_classic_counts(): + assert sbs.parse_resolved( + {"resolved_instances": 80, "total_instances": 196} + ) == (80, 196) + + +def test_parse_resolved_from_id_lists(): + report = {"resolved_ids": ["a", "b", "c"], "completed_ids": ["a", "b", "c", "d"]} + # no total_instances -> falls back to completed_ids length + assert sbs.parse_resolved(report) == (3, 4) + + +def test_parse_resolved_raises_on_garbage(): + with pytest.raises(ValueError): + sbs.parse_resolved({"nope": 1}) + + +# --- lm-eval-shaped results ------------------------------------------------ + +def test_build_results_json_is_lm_eval_shaped(): + res = sbs.build_results_json( + "swebench_lite", 49, 196, "m", "0.4.12", {"resolved_instances": 
49} + ) + assert "lm_eval_version" in res # detection key for collect_eval_results + task = res["results"]["swebench_lite"] + assert task["exact_match,resolved"] == pytest.approx(0.25) + cfg = res["configs"]["swebench_lite"] + assert cfg["filter_list"] == [{"name": "resolved"}] + assert res["n-samples"]["swebench_lite"]["effective"] == 196 + + +def test_score_offline_end_to_end(tmp_path): + """--report path: samples -> predictions + results JSON, no Docker.""" + samples = tmp_path / "gen" + samples.mkdir() + _write_samples(samples, [ + {"doc": {"instance_id": "r__p-1"}, "filtered_resps": ["```diff\ndiff --git a/a b/a\n+x\n```"]}, + ]) + report = tmp_path / "report.json" + report.write_text(json.dumps({"resolved_instances": 1, "total_instances": 1})) + out = tmp_path / "out" + rc = sbs.main([ + "--samples-dir", str(samples), "--out-dir", str(out), + "--model-name", "m", "--report", str(report), + ]) + assert rc == 0 + assert (out / "predictions.jsonl").exists() + results = json.loads((out / "results_swebench_lite.json").read_text()) + assert results["results"]["swebench_lite"]["exact_match,resolved"] == 1.0 + + +def test_predictions_only_writes_predictions_no_results(tmp_path): + """SWEBENCH_SKIP_SCORE path: predictions only, no Docker, no results JSON.""" + samples = tmp_path / "gen" + samples.mkdir() + _write_samples(samples, [ + {"doc": {"instance_id": "r__p-1"}, "filtered_resps": ["```diff\ndiff --git a/a b/a\n+x\n```"]}, + ]) + out = tmp_path / "out" + rc = sbs.main([ + "--samples-dir", str(samples), "--out-dir", str(out), + "--model-name", "m", "--predictions-only", + ]) + assert rc == 0 + assert (out / "predictions.jsonl").exists() + assert not (out / "results_swebench_lite.json").exists() + + +# --- integration with the existing pipeline (needs tabulate + py3.10+) ----- + +@pytest.mark.skipif(sys.version_info < (3, 10), reason="repo modules use py3.10 syntax") +def test_results_json_flows_through_collect_and_validate(tmp_path, monkeypatch): + 
pytest.importorskip("tabulate") + import collect_eval_results as cer + import validate_scores as vs + + art = tmp_path / "eval" + art.mkdir() + (art / "meta_env.json").write_text(json.dumps({ + "infmax_model_prefix": "dsr1", "hw": "b200", "framework": "sglang", + "precision": "fp8", "isl": 8192, "osl": 1024, + })) + res = sbs.build_results_json( + "swebench_lite", 150, 300, "dsr1", "0.4.12", None + ) + (art / "results_swebench_lite.json").write_text(json.dumps(res)) + + # collect surfaces the resolved-rate as the unified `score`. + rows = cer.collect_eval_rows(tmp_path) + assert len(rows) == 1 + assert rows[0]["task"] == "swebench_lite" + assert rows[0]["score"] == pytest.approx(0.5) + + # validate_scores gates exact_match,resolved against thresholds.json (0.10). + monkeypatch.chdir(art) + monkeypatch.setattr(sys, "argv", [ + "validate_scores.py", + "--results-glob", "results_swebench_lite.json", + ]) + assert vs.main() == 0 # 0.5 >= 0.10 default threshold diff --git a/utils/evals/thresholds.json b/utils/evals/thresholds.json index d6c091152..cbbe65105 100644 --- a/utils/evals/thresholds.json +++ b/utils/evals/thresholds.json @@ -1,7 +1,8 @@ { "default": { "gsm8k": 0.90, - "gpqa_diamond_cot_n_shot": 0.30 + "gpqa_diamond_cot_n_shot": 0.30, + "swebench_lite": 0.10 }, "models": { "dsr1": { From 4886287d3daec61a1f4da3bd9dcdbdb5f014a23d Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Fri, 26 Jun 2026 19:02:58 -0500 Subject: [PATCH 2/4] fix(evals): remove unused Any import in swebench_score.py Addresses CodeQL (github-code-quality) finding on the PR. 
--- utils/evals/swebench_score.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/evals/swebench_score.py b/utils/evals/swebench_score.py index edf8ef212..371260443 100644 --- a/utils/evals/swebench_score.py +++ b/utils/evals/swebench_score.py @@ -26,7 +26,7 @@ import subprocess import sys from pathlib import Path -from typing import Any, Iterator, Optional +from typing import Iterator, Optional DEFAULT_DATASET = "princeton-nlp/SWE-bench_Lite" DEFAULT_TASK = "swebench_lite" From 6d5972abe8ac693a4a7fa845c6035301958b1bd7 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Fri, 26 Jun 2026 19:15:11 -0500 Subject: [PATCH 3/4] fix(evals): address PR review (claude bot) findings on swebench - swebench_score.py: bound the bare-diff fallback to the diff body so trailing prose after a patch can't be glued on (would fail git apply -> unresolved, suppressing resolved-rate). Add _trim_to_diff_body + regression tests. - benchmark_lib.sh: derive the scoring dataset from the generation YAML's dataset_path so generation/scoring can't diverge; SWEBENCH_DATASET (if set) must match or it fails fast. Update docstring + EVALS.md. CodeQL unused-import (Any) already fixed in 4886287d. --- benchmarks/benchmark_lib.sh | 29 +++++++++++++--- utils/evals/EVALS.md | 6 ++-- utils/evals/swebench_score.py | 57 ++++++++++++++++++++++++++++--- utils/evals/test_swebench_eval.py | 27 +++++++++++++++ 4 files changed, 108 insertions(+), 11 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 70353e65d..fb9d35f64 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -977,8 +977,10 @@ META # pipeline (append_lm_eval_summary / collect / validate) is unchanged. 
# # Env knobs: -# SWEBENCH_DATASET (default princeton-nlp/SWE-bench_Lite) -# SWEBENCH_TASK_NAME (default swebench_lite) +# SWEBENCH_TASK_NAME (default swebench_lite) selects utils/evals/<task_name>.yaml +# SWEBENCH_DATASET optional; must equal the YAML's dataset_path (the +# scoring dataset is derived from the YAML so generation +# and scoring never diverge) -- mismatch fails fast # SWEBENCH_MAX_WORKERS (default 4) harness Docker workers # SWEBENCH_NAMESPACE pass "" on arm/Mac to build images locally # SWEBENCH_SKIP_SCORE "true" => generate + stage predictions only, no Docker @@ -989,10 +991,29 @@ run_swebench_eval() { local gen_dir gen_dir=$(mktemp -d /tmp/swebench_gen-XXXXXX) + # Keep the scoring dataset in lockstep with the generation YAML: the harness + # must score against the same instance set lm-eval generated patches for, or + # the instance IDs won't match. Derive it from the task YAML; if + # SWEBENCH_DATASET is set it must agree (fail-fast rather than mis-score). + local yaml_path="${EVAL_TASKS_DIR:-utils/evals/${task_name}.yaml}" + local dataset + dataset=$(awk '/^dataset_path:[[:space:]]/{print $2; exit}' "$yaml_path" 2>/dev/null) + if [ -z "$dataset" ]; then + echo "ERROR: could not read dataset_path from ${yaml_path}" >&2 + rm -rf "$gen_dir" 2>/dev/null || true + return 1 + fi + if [ -n "${SWEBENCH_DATASET:-}" ] && [ "${SWEBENCH_DATASET}" != "$dataset" ]; then + echo "ERROR: SWEBENCH_DATASET='${SWEBENCH_DATASET}' disagrees with ${yaml_path} dataset_path='${dataset}'." >&2 + echo " Generation and scoring must use the same dataset; edit the YAML or unset SWEBENCH_DATASET." >&2 + rm -rf "$gen_dir" 2>/dev/null || true + return 1 + fi + # 1. Generation via lm-eval (reuses endpoint wiring, _patch_lm_eval, etc.). # run_lm_eval already passes --log_samples, which is what we consume.
local prev_tasks_dir="${EVAL_TASKS_DIR:-}" - export EVAL_TASKS_DIR="${EVAL_TASKS_DIR:-utils/evals/${task_name}.yaml}" + export EVAL_TASKS_DIR="$yaml_path" local gen_rc=0 run_lm_eval "$@" --results-dir "$gen_dir" || gen_rc=$? export EVAL_TASKS_DIR="$prev_tasks_dir" @@ -1031,7 +1052,7 @@ run_swebench_eval() { --out-dir "$out_dir" \ --model-name "${MODEL_NAME:-$MODEL}" \ --task-name "$task_name" \ - --dataset-name "${SWEBENCH_DATASET:-princeton-nlp/SWE-bench_Lite}" \ + --dataset-name "$dataset" \ --max-workers "${SWEBENCH_MAX_WORKERS:-4}" \ --lm-eval-version "$lm_eval_version" \ ${SWEBENCH_NAMESPACE+--namespace "$SWEBENCH_NAMESPACE"} \ diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index 752e13131..a7738defc 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -187,8 +187,10 @@ append_lm_eval_summary - Scoring: `utils/evals/swebench_score.py` (diff extraction → `predictions.jsonl` → `python -m swebench.harness.run_evaluation` → resolved-rate → results JSON). Offline `--report` mode skips Docker for testing. -- Knobs: `SWEBENCH_DATASET`, `SWEBENCH_TASK_NAME`, `SWEBENCH_MAX_WORKERS`, - `SWEBENCH_NAMESPACE` (pass `""` on arm/Mac), `SWEBENCH_SKIP_SCORE=true` (generate-only). +- Knobs: `SWEBENCH_TASK_NAME` (selects the YAML), `SWEBENCH_MAX_WORKERS`, + `SWEBENCH_NAMESPACE` (pass `""` on arm/Mac), `SWEBENCH_SKIP_SCORE=true` (generate-only). The + scoring dataset is derived from the YAML's `dataset_path` so generation and scoring can't diverge; + `SWEBENCH_DATASET`, if set, must match it (mismatch fails fast). - **Requires Docker + ~120 GB disk on the scoring host.** This is an MVP; the single-shot prompt and diff extraction still need tuning to reach published resolved-rates, and the `thresholds.json` entry needs calibration from a baseline run. 
diff --git a/utils/evals/swebench_score.py b/utils/evals/swebench_score.py index 371260443..c1e511ed0 100644 --- a/utils/evals/swebench_score.py +++ b/utils/evals/swebench_score.py @@ -39,6 +39,44 @@ ) _DIFF_GIT_RE = re.compile(r"(?:^|\n)(diff --git .*)", re.DOTALL) +# Line prefixes that belong to a (git) unified-diff body. Anything else marks +# the end of the patch. +_DIFF_LINE_PREFIXES = ( + "diff ", "index ", "--- ", "+++ ", "@@", "+", "-", " ", "\\", + "old mode ", "new mode ", "new file mode ", "deleted file mode ", + "rename ", "copy ", "similarity ", "dissimilarity ", + "Binary files ", "GIT binary patch", +) + + +def _trim_to_diff_body(text: str) -> str: + """Keep only the leading run of diff-shaped lines, dropping trailing prose. + + Models frequently emit a bare patch followed by an explanation ("Notes:", + "This fixes #123."). With no terminator that tail gets glued onto the patch + and rejected by ``git apply``, scoring the instance unresolved. Blank lines + are kept only when the diff resumes after them; a blank line followed by + non-diff text ends the patch. + """ + lines = text.splitlines() + out: list[str] = [] + i, n = 0, len(lines) + while i < n: + if lines[i].startswith(_DIFF_LINE_PREFIXES): + out.append(lines[i]) + i += 1 + continue + if lines[i] == "": + j = i + while j < n and lines[j] == "": + j += 1 + if j < n and lines[j].startswith(_DIFF_LINE_PREFIXES): + out.extend(lines[i:j]) # interior blank line(s); diff resumes + i = j + continue + break # trailing blank(s)+prose, or any other non-diff line + return "\n".join(out) + def extract_patch(text: str) -> str: """Pull a unified diff out of a model generation. @@ -50,19 +88,28 @@ def extract_patch(text: str) -> str: """ if not text: return "" + + def _finish(body: str) -> str: + body = _trim_to_diff_body(body).strip("\n") + return body + "\n" if body else "" + # 1. Prefer a fenced block that actually looks like a diff. 
for match in _FENCED_DIFF_RE.finditer(text): body = match.group("body") if "diff --git" in body or body.lstrip().startswith(("--- ", "+++ ")): - return body.strip("\n") + "\n" - # 2. Fall back to the first ``diff --git`` to end-of-text. + return _finish(body) + # 2. Fall back to a bare ``diff --git``, trimmed to the diff body so + # trailing prose can't corrupt the patch. git_match = _DIFF_GIT_RE.search(text) if git_match: - return git_match.group(1).strip("\n") + "\n" - # 3. Last resort: a lone fenced block, or the raw text. + trimmed = _finish(git_match.group(1)) + if trimmed: + return trimmed + # 3. Last resort: a lone fenced block (fence-bounded), or the raw text. lone = _FENCED_DIFF_RE.search(text) if lone: - return lone.group("body").strip("\n") + "\n" + body = lone.group("body").strip("\n") + return body + "\n" if body else "" return text.strip("\n") + "\n" if text.strip() else "" diff --git a/utils/evals/test_swebench_eval.py b/utils/evals/test_swebench_eval.py index 8e59975ca..067742759 100644 --- a/utils/evals/test_swebench_eval.py +++ b/utils/evals/test_swebench_eval.py @@ -40,6 +40,33 @@ def test_extract_patch_bare_diff_git(): assert "no fence" not in patch +def test_extract_patch_bare_diff_strips_trailing_prose(): + # A bare diff followed by an explanation must not glue the prose onto the + # patch (git apply would reject it -> instance scored unresolved). + text = ( + "diff --git a/x b/x\n--- a/x\n+++ b/x\n@@ -1 +1 @@\n-old\n+new\n" + "\nNotes:\nThis fixes #123.\n" + ) + patch = sbs.extract_patch(text) + assert patch.rstrip().endswith("+new") + assert "Notes:" not in patch + assert "This fixes" not in patch + + +def test_extract_patch_keeps_multi_file_and_interior_context(): + # Multiple files + a blank context line (represented as " ") stay intact. + text = ( + "```diff\n" + "diff --git a/a b/a\n@@ -1,2 +1,2 @@\n context\n-x\n+y\n" + "diff --git a/b b/b\n@@ -1 +1 @@\n-p\n+q\n" + "```\nthanks!" 
+ ) + patch = sbs.extract_patch(text) + assert "diff --git a/a b/a" in patch + assert "diff --git a/b b/b" in patch + assert "thanks" not in patch + + def test_extract_patch_empty_when_no_diff(): assert sbs.extract_patch("") == "" # Prose with no diff markers falls back to the raw text (harness will reject). From ce358e4049bc699fc14fc9780e129dd046c45836 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Fri, 26 Jun 2026 20:42:24 -0500 Subject: [PATCH 4/4] feat(evals): /run-evals comment command to run one eval on one recipe Adds a lightweight, eval-only PR-comment trigger (no perf sweep): /run-evals <eval> <config-key> [conc] [master-config] e.g. /run-evals swebench_lite dsr1-fp4-b200-sglang 16 - benchmark_lib.sh: EVAL_FRAMEWORK env now wins over run_eval's --framework arg so an eval-only run can override the recipes' hardcoded lm-eval (default behavior unchanged when env unset). - e2e-tests.yml / benchmark-tmpl.yml / benchmark-multinode-tmpl.yml: thread new eval-framework + eval-task inputs -> EVAL_FRAMEWORK / EVAL_TASKS_DIR env. - run-evals.yml: new comment workflow (mirrors pr-comment-sweep.yml auth/SHA-pin/ reply), maps <eval> -> framework+task, infers nvidia/amd master config from the config-key HW token, builds 'test-config ... --evals-only', calls e2e-tests.yml. - test_run_eval_dispatch.py: unit tests for the env-override dispatch. NOTE: swebench scoring needs Docker on the eval runner (else SWEBENCH_SKIP_SCORE).
--- .../workflows/benchmark-multinode-tmpl.yml | 12 ++ .github/workflows/benchmark-tmpl.yml | 12 ++ .github/workflows/e2e-tests.yml | 24 +++ .github/workflows/run-evals.yml | 192 ++++++++++++++++++ benchmarks/benchmark_lib.sh | 9 +- utils/evals/test_run_eval_dispatch.py | 53 +++++ 6 files changed, 300 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/run-evals.yml create mode 100644 utils/evals/test_run_eval_dispatch.py diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 3beb246cc..b8600b049 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -91,6 +91,16 @@ on: type: string required: false default: "" + eval-framework: + description: "Eval framework (lm-eval | swebench). Empty = recipe default." + type: string + required: false + default: "" + eval-task: + description: "Eval task YAML path. Empty = framework default." + type: string + required: false + default: "" scenario-type: description: "Scenario type (fixed-seq-len or agentic-coding)" type: string @@ -143,6 +153,8 @@ env: RUN_EVAL: ${{ inputs.run-eval }} EVAL_ONLY: ${{ inputs.eval-only }} EVAL_CONC: ${{ inputs.eval-conc }} + EVAL_FRAMEWORK: ${{ inputs.eval-framework }} + EVAL_TASKS_DIR: ${{ inputs.eval-task }} SCENARIO_TYPE: ${{ inputs.scenario-type }} SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || 'fixed_seq_len/' }} IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }} diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index a57e89725..15c052207 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -59,6 +59,16 @@ on: type: boolean required: false default: false + eval-framework: + description: "Eval framework (lm-eval | swebench). Empty = recipe default." 
+ type: string + required: false + default: "" + eval-task: + description: "Eval task YAML path. Empty = framework default." + type: string + required: false + default: "" random-range-ratio: required: false type: string @@ -108,6 +118,8 @@ env: DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} EVAL_ONLY: ${{ inputs.eval-only }} + EVAL_FRAMEWORK: ${{ inputs.eval-framework }} + EVAL_TASKS_DIR: ${{ inputs.eval-task }} SCENARIO_TYPE: ${{ inputs.scenario-type }} SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || 'fixed_seq_len/' }} IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }} diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 1b83a798a..b5b65e8cf 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -21,6 +21,16 @@ on: required: false type: string default: "" + eval-framework: + description: "Eval framework (lm-eval | swebench). Overrides the recipe default." + required: false + type: string + default: "lm-eval" + eval-task: + description: "Eval task YAML path (e.g. utils/evals/gsm8k.yaml). Empty = framework default." + required: false + type: string + default: "" workflow_call: inputs: generate-cli-command: @@ -40,6 +50,16 @@ on: required: false type: string default: "" + eval-framework: + description: "Eval framework (lm-eval | swebench). Overrides the recipe default." + required: false + type: string + default: "lm-eval" + eval-task: + description: "Eval task YAML path (e.g. utils/evals/gsm8k.yaml). Empty = framework default." 
+ required: false + type: string + default: "" jobs: get-jobs: @@ -160,6 +180,8 @@ jobs: run-eval: true eval-only: true eval-conc: ${{ matrix.config['eval-all-concs'] && join(matrix.config.conc, ' ') || matrix.config['eval-conc'] }} + eval-framework: ${{ inputs.eval-framework }} + eval-task: ${{ inputs.eval-task }} ref: ${{ inputs.ref }} test-sweep-agentic: @@ -294,6 +316,8 @@ jobs: disagg: ${{ matrix.config.disagg }} run-eval: true eval-only: true + eval-framework: ${{ inputs.eval-framework }} + eval-task: ${{ inputs.eval-task }} ref: ${{ inputs.ref }} collect-results: diff --git a/.github/workflows/run-evals.yml b/.github/workflows/run-evals.yml new file mode 100644 index 000000000..1d6b5d06e --- /dev/null +++ b/.github/workflows/run-evals.yml @@ -0,0 +1,192 @@ +name: Slash Command Run Evals +run-name: "/run-evals PR #${{ github.event.issue.number }}" + +# Comment-triggered, eval-only run of ONE eval on ONE recipe (no perf sweep). +# Usage in a PR comment: +# /run-evals <eval> <config-key> [conc] [master-config] +# where <eval> is one of: gsm8k | gpqa | swebench_lite (alias: swebench). +# Example: /run-evals swebench_lite dsr1-fp4-b200-sglang 16 +# Mirrors pr-comment-sweep.yml; differs only in parsing + the eval mapping it +# forwards to e2e-tests.yml (eval-framework / eval-task).
+ +on: + issue_comment: + types: [created] + +permissions: + contents: read + issues: write + pull-requests: write + +jobs: + get-jobs: + # Only run for PR comments that start with /run-evals + if: ${{ github.event.issue.pull_request && startsWith(github.event.comment.body, '/run-evals') }} + runs-on: ubuntu-latest + outputs: + pr-number: ${{ steps.parse.outputs.pr-number }} + generator-args: ${{ steps.parse.outputs.generator-args }} + eval-framework: ${{ steps.parse.outputs.eval-framework }} + eval-task: ${{ steps.parse.outputs.eval-task }} + author-can-bypass: ${{ steps.auth.outputs.can-bypass }} + # Immutable ref (commit SHA) to prevent TOCTOU on refs/pull//head + ref: ${{ steps.ref_comment.outputs.ref }} + steps: + - name: Parse PR comment (/run-evals [conc] [master]) + id: parse + if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request && startsWith(github.event.comment.body, '/run-evals') }} + shell: bash + env: + BODY: ${{ github.event.comment.body }} + PR_NUMBER: ${{ github.event.issue.number }} + run: | + set -euo pipefail + # Require /run-evals at the start of a line. + cmd_line=$(printf "%s" "$BODY" | awk '/^\/run-evals/{print; exit}') + if [[ -z "$cmd_line" ]]; then + echo "No /run-evals command found at comment start" >&2 + exit 1 + fi + # Positional args after the command. + read -ra parts <<< "${cmd_line#/run-evals}" + eval_name="${parts[0]:-}" + config_key="${parts[1]:-}" + conc="${parts[2]:-}" + master_override="${parts[3]:-}" + if [[ -z "$eval_name" || -z "$config_key" ]]; then + echo "usage: /run-evals [conc] [master-config]" >&2 + echo "valid evals: gsm8k | gpqa | swebench_lite" >&2 + exit 1 + fi + + # Map -> (framework, task YAML). 
+ case "$eval_name" in + gsm8k) framework="lm-eval"; task="utils/evals/gsm8k.yaml" ;; + gpqa|gpqa_diamond) framework="lm-eval"; task="utils/evals/gpqa_diamond.yaml" ;; + swebench|swebench_lite) framework="swebench"; task="utils/evals/swebench_lite.yaml" ;; + *) + echo "unknown eval '$eval_name' (valid: gsm8k, gpqa, swebench_lite)" >&2 + exit 1 + ;; + esac + + if [[ -n "$conc" && ! "$conc" =~ ^[1-9][0-9]*$ ]]; then + echo "conc must be a positive integer, got '$conc'" >&2 + exit 1 + fi + + # Pick the platform master config from the config-key's hardware token, + # unless an explicit 4th arg overrides it. + if [[ -n "$master_override" ]]; then + master="$master_override" + elif [[ "$config_key" =~ (b200|b300|h100|h200|gb200|gb300) ]]; then + master=".github/configs/nvidia-master.yaml" + elif [[ "$config_key" =~ (mi300x|mi325x|mi355x) ]]; then + master=".github/configs/amd-master.yaml" + else + echo "cannot infer platform from config-key '$config_key'; pass the master config path as a 4th arg" >&2 + exit 1 + fi + + gen="test-config --config-files ${master} --config-keys ${config_key} --evals-only" + if [[ -n "$conc" ]]; then + gen="${gen} --conc ${conc}" + fi + + { + echo "generator-args=${gen}" + echo "eval-framework=${framework}" + echo "eval-task=${task}" + echo "pr-number=${PR_NUMBER}" + } >> "$GITHUB_OUTPUT" + + - name: Check author permissions + id: auth + if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request }} + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + with: + script: | + const owner = context.repo.owner; + const repo = context.repo.repo; + const username = context.payload.comment?.user?.login; + let permission = 'none'; + try { + const res = await github.rest.repos.getCollaboratorPermissionLevel({ owner, repo, username }); + permission = res.data?.permission || 'none'; + } catch (e) { + permission = 'none'; + } + const canBypass = ['admin','maintain','write'].includes(permission); + 
core.info(`Author ${username} permission: ${permission}; bypass=${canBypass}`); + core.setOutput('can-bypass', canBypass ? 'true' : 'false'); + + # ---- PR SHA pinning ---- + - name: Resolve immutable PR ref (pin to head SHA) + id: ref_comment + if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request && startsWith(github.event.comment.body, '/run-evals') }} + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + with: + script: | + const owner = context.repo.owner; + const repo = context.repo.repo; + const pr = context.issue.number; + const res = await github.rest.pulls.get({ owner, repo, pull_number: pr }); + const sha = res.data.head.sha; + core.info(`Resolved PR #${pr} head SHA: ${sha}`); + core.setOutput('ref', sha); + + - name: Reply with run link + if: ${{ github.event_name == 'issue_comment' && startsWith(github.event.comment.body, '/run-evals') && github.repository_owner == 'SemiAnalysisAI' }} + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + continue-on-error: true + env: + RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + AUTHOR: ${{ github.event.comment.user.login }} + GEN_CMD: ${{ steps.parse.outputs.generator-args }} + EVAL_FRAMEWORK: ${{ steps.parse.outputs.eval-framework }} + CAN_BYPASS: ${{ steps.auth.outputs.can-bypass }} + PINNED_REF: ${{ steps.ref_comment.outputs.ref }} + with: + github-token: ${{ github.token }} + script: | + const owner = context.repo.owner; + const repo = context.repo.repo; + const issue_number = context.issue.number; + const runUrl = process.env.RUN_URL; + const author = process.env.AUTHOR; + const genCmd = process.env.GEN_CMD || ''; + const framework = process.env.EVAL_FRAMEWORK || ''; + const canBypass = (process.env.CAN_BYPASS || '').toLowerCase() === 'true'; + const pinned = process.env.PINNED_REF || ''; + const shortSha = pinned ? pinned.slice(0, 7) : ''; + const approvalMsg = canBypass ? 
'Approval: not required (trusted collaborator).' : "Approval: required in environment 'Outside Collaborator E2E Test'."; + const body = `@${author} Kicking off an eval-only run (framework: \`${framework}\`).\n\nRun: ${runUrl}\nCommand: \`${genCmd}\`\nPinned ref: \`${shortSha}\`\n${approvalMsg}`; + await github.rest.issues.createComment({ owner, repo, issue_number, body }); + + approval: + needs: get-jobs + if: ${{ github.event_name == 'issue_comment' && needs.get-jobs.outputs.pr-number != '' && needs.get-jobs.outputs.generator-args != '' && needs.get-jobs.outputs.author-can-bypass != 'true' }} + runs-on: ubuntu-latest + name: approval + environment: Outside Collaborator E2E Test + steps: + - run: echo "approved" + + validate: + needs: [get-jobs, approval] + # always() is required to evaluate this condition when 'approval' is skipped (trusted author) + if: ${{ always() && needs.get-jobs.result == 'success' && needs.get-jobs.outputs.pr-number != '' && needs.get-jobs.outputs.generator-args != '' && (needs.get-jobs.outputs.author-can-bypass == 'true' || needs.approval.result == 'success') }} + # Concurrency at job level so non-/run-evals comments don't cancel active runs + concurrency: + group: "run-evals-PR#${{ needs.get-jobs.outputs.pr-number }}" + cancel-in-progress: true + uses: ./.github/workflows/e2e-tests.yml + name: validate + secrets: inherit + with: + generate-cli-command: ${{ needs.get-jobs.outputs.generator-args }} + eval-framework: ${{ needs.get-jobs.outputs.eval-framework }} + eval-task: ${{ needs.get-jobs.outputs.eval-task }} + test-name: PR #${{ needs.get-jobs.outputs.pr-number }} /run-evals + # Use pinned SHA to prevent TOCTOU on refs/pull//head + ref: ${{ needs.get-jobs.outputs.ref }} diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index fb9d35f64..060b0e926 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -1069,15 +1069,20 @@ run_swebench_eval() { # ------------------------------ run_eval() { - 
local framework="${EVAL_FRAMEWORK:-lm-eval}" + # EVAL_FRAMEWORK (env) wins over the --framework arg so an eval-only run + # (e.g. the /run-evals command) can override the recipe scripts' hardcoded + # `--framework lm-eval`. With the env unset, the CLI arg (else lm-eval) is + # used exactly as before. + local cli_framework="" local forwarded=() while [[ $# -gt 0 ]]; do case "$1" in - --framework) framework="$2"; shift 2 ;; + --framework) cli_framework="$2"; shift 2 ;; *) forwarded+=("$1"); shift ;; esac done + local framework="${EVAL_FRAMEWORK:-${cli_framework:-lm-eval}}" # Compute EVAL_MAX_MODEL_LEN if not already set by the calling script if [ -z "${EVAL_MAX_MODEL_LEN:-}" ]; then diff --git a/utils/evals/test_run_eval_dispatch.py b/utils/evals/test_run_eval_dispatch.py new file mode 100644 index 000000000..44a5d785a --- /dev/null +++ b/utils/evals/test_run_eval_dispatch.py @@ -0,0 +1,53 @@ +"""run_eval framework dispatch: EVAL_FRAMEWORK (env) overrides the --framework arg. + +This is what lets `/run-evals swebench_lite ...` run swebench even though every +recipe script hardcodes `run_eval --framework lm-eval`. With the env unset, the +CLI arg (else lm-eval) is used as before. +""" + +import os +import subprocess +from pathlib import Path + +BENCHMARK_LIB = Path(__file__).resolve().parents[2] / "benchmarks" / "benchmark_lib.sh" + +# Stub the framework runners so dispatch is observable without a server/Docker, +# and pin EVAL_MAX_MODEL_LEN so run_eval skips context computation. 
+_SCRIPT = r''' +source "$BENCHMARK_LIB" +run_lm_eval() { echo "DISPATCH=lm-eval"; } +run_swebench_eval() { echo "DISPATCH=swebench"; } +export EVAL_MAX_MODEL_LEN=16384 +unset EVAL_CONCURRENT_REQUESTS +run_eval --framework "$CLI_FW" --port 8888 +''' + + +def _dispatch(cli_fw: str, env_fw: str | None) -> str: + env = {**os.environ, "BENCHMARK_LIB": str(BENCHMARK_LIB), "CLI_FW": cli_fw} + env.pop("EVAL_FRAMEWORK", None) + if env_fw is not None: + env["EVAL_FRAMEWORK"] = env_fw + res = subprocess.run( + ["bash", "-c", _SCRIPT], env=env, text=True, capture_output=True, check=True + ) + return res.stdout + + +def test_env_framework_overrides_cli_arg(): + # recipe passes --framework lm-eval, but EVAL_FRAMEWORK=swebench wins. + assert "DISPATCH=swebench" in _dispatch("lm-eval", "swebench") + + +def test_cli_arg_used_when_env_unset(): + assert "DISPATCH=lm-eval" in _dispatch("lm-eval", None) + + +def test_swebench_via_cli_arg_when_env_unset(): + assert "DISPATCH=swebench" in _dispatch("swebench", None) + + +def test_empty_env_falls_back_to_cli_arg(): + # An empty EVAL_FRAMEWORK (how the template passes it when unset) must not + # force anything -- the CLI arg still wins. + assert "DISPATCH=lm-eval" in _dispatch("lm-eval", "")