From 27d164103995aea671631cfc0011479afd86f8b8 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Fri, 26 Jun 2026 18:53:23 -0500
Subject: [PATCH 1/4] feat(evals): add SWE-bench Lite accuracy eval

lm-eval cannot score SWE-bench (no repo-level Docker test executor), so this
reuses lm-eval for patch *generation* and adds a scoring step that runs the
official swebench harness, emitting an lm-eval-shaped results JSON so the
existing collect/validate pipeline works unchanged.

- swebench_lite.yaml: lm-eval generate_until task over SWE-bench Lite
- swebench_score.py: diff extraction -> predictions.jsonl -> swebench harness
  -> resolved-rate -> lm-eval-shaped results; offline --report/--predictions-only
- benchmark_lib.sh: run_swebench_eval + --framework swebench dispatch
- collect_eval_results.py: recognize 'resolved' filter as the primary score
- thresholds.json: placeholder swebench_lite entry (needs calibration)
- EVALS.md: document the new framework + task
- test_swebench_eval.py: unit + integration tests
---
 benchmarks/benchmark_lib.sh       |  78 +++++++
 utils/collect_eval_results.py     |   4 +-
 utils/evals/EVALS.md              |  25 +++
 utils/evals/swebench_lite.yaml    |  56 +++++
 utils/evals/swebench_score.py     | 340 ++++++++++++++++++++++++++++++
 utils/evals/test_swebench_eval.py | 182 ++++++++++++++++
 utils/evals/thresholds.json       |   3 +-
 7 files changed, 686 insertions(+), 2 deletions(-)
 create mode 100644 utils/evals/swebench_lite.yaml
 create mode 100644 utils/evals/swebench_score.py
 create mode 100644 utils/evals/test_swebench_eval.py

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 95e063a3d..70353e65d 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -966,6 +966,83 @@ META
     echo "Moved eval artifacts to: $(pwd)"
 }
 
+# ------------------------------
+# SWE-bench eval helpers
+# ------------------------------
+
+# Run the SWE-bench Lite eval: generate patches with lm-eval, then score them
+# with the official swebench Docker harness. lm-eval cannot score SWE-bench
+# itself (no repo-level test executor), so we reuse it only for generation and
+# emit an lm-eval-shaped results JSON from swebench_score.py so the rest of the
+# pipeline (append_lm_eval_summary / collect / validate) is unchanged.
+#
+# Env knobs:
+#   SWEBENCH_DATASET       (default princeton-nlp/SWE-bench_Lite)
+#   SWEBENCH_TASK_NAME     (default swebench_lite)
+#   SWEBENCH_MAX_WORKERS   (default 4) harness Docker workers
+#   SWEBENCH_NAMESPACE     pass "" on arm/Mac to build images locally
+#   SWEBENCH_SKIP_SCORE    "true" => generate + stage predictions only, no Docker
+#                          (for runners without Docker; score elsewhere)
+run_swebench_eval() {
+    local out_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}"
+    local task_name="${SWEBENCH_TASK_NAME:-swebench_lite}"
+    local gen_dir
+    gen_dir=$(mktemp -d /tmp/swebench_gen-XXXXXX)
+
+    # 1. Generation via lm-eval (reuses endpoint wiring, _patch_lm_eval, etc.).
+    #    run_lm_eval already passes --log_samples, which is what we consume.
+    local prev_tasks_dir="${EVAL_TASKS_DIR:-}"
+    export EVAL_TASKS_DIR="${EVAL_TASKS_DIR:-utils/evals/${task_name}.yaml}"
+    local gen_rc=0
+    run_lm_eval "$@" --results-dir "$gen_dir" || gen_rc=$?
+    export EVAL_TASKS_DIR="$prev_tasks_dir"
+    if [ "$gen_rc" -ne 0 ]; then
+        echo "ERROR: swebench generation (lm-eval) failed with $gen_rc" >&2
+        rm -rf "$gen_dir" 2>/dev/null || true
+        return "$gen_rc"
+    fi
+
+    # Preserve generations as artifacts alongside the scored results.
+    mkdir -p "$out_dir"
+    find "$gen_dir" -name 'samples_*.jsonl' -exec cp -f {} "$out_dir"/ \; 2>/dev/null || true
+    export EVAL_RESULT_DIR="$out_dir"
+
+    local lm_eval_version
+    lm_eval_version=$(python3 -c 'import lm_eval; print(lm_eval.__version__)' 2>/dev/null || echo unknown)
+
+    if [ "${SWEBENCH_SKIP_SCORE:-false}" = "true" ]; then
+        # Generation-only mode: emit predictions, defer Docker scoring elsewhere.
+        # TODO(alec): wire the separate scoring job (Modal / sb-cli / CPU runner).
+        local skip_rc=0
+        python3 utils/evals/swebench_score.py \
+            --samples-dir "$gen_dir" --out-dir "$out_dir" \
+            --model-name "${MODEL_NAME:-$MODEL}" --task-name "$task_name" \
+            --predictions-only || skip_rc=$?
+        echo "SWEBENCH_SKIP_SCORE=true: staged predictions only (no resolved-rate)." >&2
+        rm -rf "$gen_dir" 2>/dev/null || true
+        return "$skip_rc"
+    fi
+
+    # 2. Score with the official swebench harness (requires Docker) and emit the
+    #    lm-eval-shaped results JSON into EVAL_RESULT_DIR.
+    local score_rc=0
+    python3 utils/evals/swebench_score.py \
+        --samples-dir "$gen_dir" \
+        --out-dir "$out_dir" \
+        --model-name "${MODEL_NAME:-$MODEL}" \
+        --task-name "$task_name" \
+        --dataset-name "${SWEBENCH_DATASET:-princeton-nlp/SWE-bench_Lite}" \
+        --max-workers "${SWEBENCH_MAX_WORKERS:-4}" \
+        --lm-eval-version "$lm_eval_version" \
+        ${SWEBENCH_NAMESPACE+--namespace "$SWEBENCH_NAMESPACE"} \
+        || score_rc=$?
+    rm -rf "$gen_dir" 2>/dev/null || true
+    if [ "$score_rc" -ne 0 ]; then
+        echo "ERROR: swebench scoring failed with $score_rc" >&2
+        return "$score_rc"
+    fi
+}
+
 # ------------------------------
 # Unified eval entrypoint
 # ------------------------------
@@ -1052,6 +1129,7 @@ run_eval() {
     local eval_rc=0
     case "$framework" in
         lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" || eval_rc=$? ;;
+        swebench)        run_swebench_eval "${forwarded[@]}" || eval_rc=$? ;;
         *)               echo "Unknown framework '${framework}'"; eval_rc=1 ;;
     esac
 
diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index 194fa4acb..f98a6f7c4 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -141,7 +141,9 @@ def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]:
             # Extract metrics for each filter
             for f in filter_list:
                 fname = f['name']
-                if 'strict' in fname:
+                # 'resolved' is SWE-bench's resolved-rate (swebench_score.py);
+                # treat it as the primary/strict score so it populates `score`.
+                if 'strict' in fname or 'resolved' in fname:
                     strict_val, strict_se = get_val_se(fname)
                 elif 'flex' in fname or 'extract' in fname:
                     flex_val, flex_se = get_val_se(fname)
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index 7ff878dce..752e13131 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -169,7 +169,32 @@ The codebase patches lm-eval compatibility via `_patch_lm_eval`:
 1. Reasoning token handling: extracts `reasoning_content` when `message.content` is empty.
 2. TRT compatibility: avoids injecting `{"type": "text"}` for non-HF tokenizers.
 
+### SWE-bench Lite (`--framework swebench`)
+
+SWE-bench is **not** a `generate_until` QA task — it requires applying the model's
+patch to a repo and running tests in Docker, which lm-eval cannot do. So it runs
+through a dedicated framework that reuses lm-eval for *generation* only, then scores
+with the official `swebench` harness and emits an lm-eval-shaped results JSON
+(metric `exact_match,resolved` = resolved-rate) so collect/validate work unchanged.
+
+```bash
+run_eval --framework swebench --port "$PORT"   # generation (lm-eval) -> scoring (swebench)
+append_lm_eval_summary
+```
+
+- Task: `utils/evals/swebench_lite.yaml` (generation) — SWE-bench Lite, the ~300-instance curated
+  quick-eval subset (no difficulty filter needed; Lite is already the lightweight set).
+- Scoring: `utils/evals/swebench_score.py` (diff extraction → `predictions.jsonl` →
+  `python -m swebench.harness.run_evaluation` → resolved-rate → results JSON). Offline
+  `--report` mode skips Docker for testing.
+- Knobs: `SWEBENCH_DATASET`, `SWEBENCH_TASK_NAME`, `SWEBENCH_MAX_WORKERS`,
+  `SWEBENCH_NAMESPACE` (pass `""` on arm/Mac), `SWEBENCH_SKIP_SCORE=true` (generate-only).
+- **Requires Docker + ~120 GB disk on the scoring host.** This is an MVP; the single-shot prompt and
+  diff extraction still need tuning to reach published resolved-rates, and the `thresholds.json` entry
+  needs calibration from a baseline run.
+
 ## Task files
 The following files are task definitions from lm-eval; more information on changes lives within the files:
 - `utils/evals/gsm8k.yaml`
 - `utils/evals/gpqa_diamond.yaml`
+- `utils/evals/swebench_lite.yaml` (generation only; scored by `swebench_score.py`)
diff --git a/utils/evals/swebench_lite.yaml b/utils/evals/swebench_lite.yaml
new file mode 100644
index 000000000..4633af462
--- /dev/null
+++ b/utils/evals/swebench_lite.yaml
@@ -0,0 +1,56 @@
+# SWE-bench Lite -- GENERATION ONLY.
+#
+# Lite is the ~300-instance curated subset for quick evals (no difficulty labels;
+# it's already the lightweight set, so no filtering is needed -- unlike Verified,
+# which carries a `difficulty` field).
+#
+# lm-eval is used purely to drive the served OpenAI-compatible endpoint and dump
+# one candidate patch per instance via --log_samples. The metric below is a
+# PLACEHOLDER that lm-eval computes but we ignore: the real resolved-rate comes
+# from utils/evals/swebench_score.py running the official `swebench` harness,
+# which then emits an lm-eval-shaped results JSON for collect/validate.
+#
+# Run it through the dedicated framework, not bare lm-eval:
+#   run_eval --framework swebench --port "$PORT"
+# which wires generation -> scoring. Bare `--tasks swebench_lite.yaml` would
+# produce only the meaningless placeholder metric.
+task: swebench_lite
+dataset_path: princeton-nlp/SWE-bench_Lite  # also mirrored at SWE-bench/SWE-bench_Lite
+output_type: generate_until
+test_split: test
+
+doc_to_text: |
+  You are an expert software engineer fixing a real GitHub issue in the
+  repository `{{repo}}` at commit {{base_commit}}.
+
+  <issue>
+  {{problem_statement}}
+  </issue>
+
+  Respond with ONLY a unified diff (a git patch) that resolves the issue, using
+  real repository file paths. Do not include explanations. Wrap the patch in a
+  single fenced block exactly like:
+
+  ```diff
+  diff --git a/path/to/file.py b/path/to/file.py
+  --- a/path/to/file.py
+  +++ b/path/to/file.py
+  @@ ... @@
+  ```
+# The gold patch is the nominal target. lm-eval's exact_match against it is
+# meaningless for patches (overwritten by the harness score); it only exists so
+# generate_until has a target + a metric and does not error.
+doc_to_target: "{{patch}}"
+
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0.0
+
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+
+metadata:
+  version: 0.1
diff --git a/utils/evals/swebench_score.py b/utils/evals/swebench_score.py
new file mode 100644
index 000000000..edf8ef212
--- /dev/null
+++ b/utils/evals/swebench_score.py
@@ -0,0 +1,340 @@
+#!/usr/bin/env python3
+"""Score SWE-bench patches generated by lm-eval and emit an lm-eval-shaped result.
+
+Pipeline:
+
+  1. Read lm-eval ``--log_samples`` output (samples_*.jsonl): one candidate per
+     SWE-bench instance.
+  2. Extract a unified diff from each model generation.
+  3. Write a ``predictions.jsonl`` in the format the official ``swebench`` harness
+     expects: ``{instance_id, model_name_or_path, model_patch}``.
+  4. Run ``python -m swebench.harness.run_evaluation`` (Docker) to get the
+     resolved-rate -- unless ``--no-run``/``--report`` is given (offline/testing).
+  5. Emit a results JSON shaped like an lm-eval result so the existing
+     ``collect_eval_results.py`` / ``validate_scores.py`` ingest it unchanged.
+     The metric is published as ``exact_match,resolved`` = resolved-rate.
+
+The harness needs Docker + lots of disk and is NOT runnable on this dev Mac, so
+the Docker step is isolated behind ``--no-run`` for local testing. TODO(alec):
+exercise the real Docker path (the default, without ``--no-run``) on a runner.
+"""
+
+import argparse
+import json
+import math
+import re
+import subprocess
+import sys
+from pathlib import Path
+from typing import Any, Iterator, Optional
+
+DEFAULT_DATASET = "princeton-nlp/SWE-bench_Lite"
+DEFAULT_TASK = "swebench_lite"
+
+# A unified diff, optionally inside a ```diff / ```patch fence. We try fenced
+# first (what the prompt asks for), then a bare ``diff --git`` slice.
+_FENCED_DIFF_RE = re.compile(
+    r"```(?:diff|patch)?\s*\n(?P<body>.*?)```",
+    re.DOTALL | re.IGNORECASE,
+)
+_DIFF_GIT_RE = re.compile(r"(?:^|\n)(diff --git .*)", re.DOTALL)
+
+
+def extract_patch(text: str) -> str:
+    """Pull a unified diff out of a model generation.
+
+    Best-effort and deliberately conservative -- a wrong extraction just means
+    that instance is unresolved, never a crash. Diff-extraction quality is a
+    primary tuning lever (TODO(alec)): bad fences here directly suppress the
+    resolved-rate.
+    """
+    if not text:
+        return ""
+    # 1. Prefer a fenced block that actually looks like a diff.
+    for match in _FENCED_DIFF_RE.finditer(text):
+        body = match.group("body")
+        if "diff --git" in body or body.lstrip().startswith(("--- ", "+++ ")):
+            return body.strip("\n") + "\n"
+    # 2. Fall back to the first ``diff --git`` to end-of-text.
+    git_match = _DIFF_GIT_RE.search(text)
+    if git_match:
+        return git_match.group(1).strip("\n") + "\n"
+    # 3. Last resort: a lone fenced block, or the raw text.
+    lone = _FENCED_DIFF_RE.search(text)
+    if lone:
+        return lone.group("body").strip("\n") + "\n"
+    return text.strip("\n") + "\n" if text.strip() else ""
+
+
+def _response_text(record: dict) -> str:
+    """Extract the model's text from one lm-eval sample record.
+
+    lm-eval's sample schema has drifted across versions; be tolerant.
+    TODO(alec): confirm against the pinned harness's real samples_*.jsonl.
+    """
+    for key in ("filtered_resps", "resps"):
+        val = record.get(key)
+        while isinstance(val, (list, tuple)) and val:
+            val = val[0]
+        if isinstance(val, str) and val.strip():
+            return val
+    return ""
+
+
+def _instance_id(record: dict) -> Optional[str]:
+    doc = record.get("doc")
+    if isinstance(doc, dict):
+        for key in ("instance_id", "instance", "id"):
+            val = doc.get(key)
+            if isinstance(val, str) and val:
+                return val
+    # Some versions hoist doc fields to the top level.
+    val = record.get("instance_id")
+    return val if isinstance(val, str) and val else None
+
+
+def iter_samples(samples_dir: Path) -> Iterator[dict]:
+    """Yield JSON records from every samples_*.jsonl under ``samples_dir``."""
+    files = sorted(samples_dir.rglob("samples_*.jsonl"))
+    if not files:
+        raise FileNotFoundError(
+            f"no samples_*.jsonl found under {samples_dir} -- did lm-eval run "
+            "with --log_samples?"
+        )
+    for path in files:
+        with path.open() as fh:
+            for line in fh:
+                line = line.strip()
+                if line:
+                    yield json.loads(line)
+
+
+def build_predictions(samples_dir: Path, model_name: str) -> list[dict]:
+    """Turn lm-eval samples into swebench prediction rows (dedup by instance)."""
+    by_instance: dict[str, dict] = {}
+    skipped = 0
+    for record in iter_samples(samples_dir):
+        instance_id = _instance_id(record)
+        if not instance_id:
+            skipped += 1
+            continue
+        patch = extract_patch(_response_text(record))
+        # Last write wins; SWE-bench is single-attempt so there should be one
+        # record per instance anyway.
+        by_instance[instance_id] = {
+            "instance_id": instance_id,
+            "model_name_or_path": model_name,
+            "model_patch": patch,
+        }
+    if skipped:
+        print(f"WARN: skipped {skipped} sample(s) with no instance_id", file=sys.stderr)
+    if not by_instance:
+        raise ValueError("no usable predictions extracted from samples")
+    return list(by_instance.values())
+
+
+def write_predictions(predictions: list[dict], out_path: Path) -> None:
+    with out_path.open("w") as fh:
+        for row in predictions:
+            fh.write(json.dumps(row) + "\n")
+
+
+def run_harness(
+    predictions_path: Path,
+    dataset_name: str,
+    run_id: str,
+    work_dir: Path,
+    max_workers: int,
+    namespace: Optional[str],
+) -> None:
+    """Invoke the official swebench Docker harness (requires Docker)."""
+    cmd = [
+        sys.executable, "-m", "swebench.harness.run_evaluation",
+        "--dataset_name", dataset_name,
+        "--predictions_path", str(predictions_path),
+        "--run_id", run_id,
+        "--max_workers", str(max_workers),
+    ]
+    if namespace is not None:
+        # On arm/Mac (and to force local image builds) pass --namespace ''.
+        cmd += ["--namespace", namespace]
+    print(f"[swebench] running: {' '.join(cmd)}", flush=True)
+    subprocess.run(cmd, cwd=str(work_dir), check=True)
+
+
+def find_report(work_dir: Path, model_name: str, run_id: str) -> Path:
+    """Locate the harness report JSON, tolerant to known layout variants."""
+    sanitized = model_name.replace("/", "__")
+    candidates = [
+        work_dir / f"{sanitized}.{run_id}.json",          # classic: <model>.<run_id>.json
+        work_dir / f"{model_name}.{run_id}.json",
+        work_dir / "evaluation_results" / "results.json",  # newer layout
+    ]
+    for path in candidates:
+        if path.exists():
+            return path
+    # Broad fallback: any *.json mentioning resolved/total at the top level.
+    for path in sorted(work_dir.rglob("*.json")):
+        try:
+            data = json.loads(path.read_text())
+        except (json.JSONDecodeError, OSError):
+            continue
+        if isinstance(data, dict) and (
+            "resolved_instances" in data or "resolved_ids" in data
+        ):
+            return path
+    raise FileNotFoundError(
+        f"could not locate a swebench report under {work_dir} "
+        f"(looked for {[str(c) for c in candidates]})"
+    )
+
+
+def parse_resolved(report: dict) -> tuple[int, int]:
+    """Return (resolved, total) from a harness report, tolerant to key variants.
+
+    Denominator prefers the full instance count (leaderboard convention:
+    resolved / total); completed/submitted counts are only a fallback.
+    """
+    resolved: Optional[int] = None
+    for key in ("resolved_instances", "resolved", "num_resolved"):
+        if isinstance(report.get(key), int):
+            resolved = report[key]
+            break
+    if resolved is None and isinstance(report.get("resolved_ids"), list):
+        resolved = len(report["resolved_ids"])
+
+    total: Optional[int] = None
+    for key in ("total_instances", "completed_instances", "submitted_instances"):
+        val = report.get(key)
+        if isinstance(val, int) and val > 0:
+            total = val
+            break
+    if total is None:
+        for key in ("completed_ids", "submitted_ids"):
+            if isinstance(report.get(key), list) and report[key]:
+                total = len(report[key])
+                break
+
+    if resolved is None or total is None or total <= 0:
+        raise ValueError(
+            f"could not parse resolved/total from report keys {sorted(report)}"
+        )
+    return resolved, total
+
+
+def build_results_json(
+    task: str,
+    resolved: int,
+    total: int,
+    model_name: str,
+    lm_eval_version: str,
+    report: Optional[dict],
+) -> dict:
+    """Shape the resolved-rate as an lm-eval result.
+
+    Published as ``exact_match,resolved`` so validate_scores (prefix
+    ``exact_match,``) gates it and collect_eval_results surfaces it as ``score``.
+    """
+    rate = resolved / total
+    stderr = math.sqrt(rate * (1.0 - rate) / total) if total else 0.0
+    return {
+        "lm_eval_version": lm_eval_version,
+        "model_name": model_name,
+        "results": {
+            task: {
+                "alias": task,
+                "exact_match,resolved": rate,
+                "exact_match_stderr,resolved": stderr,
+            }
+        },
+        "configs": {
+            task: {
+                "metric_list": [{"metric": "exact_match"}],
+                "filter_list": [{"name": "resolved"}],
+            }
+        },
+        "n-samples": {task: {"effective": total, "original": total}},
+        # Debugging passthrough; ignored by collectors (no lm_eval_version here).
+        "swebench": {
+            "resolved": resolved,
+            "total": total,
+            "resolved_rate": rate,
+            "report": report,
+        },
+    }
+
+
+def main(argv: Optional[list[str]] = None) -> int:
+    parser = argparse.ArgumentParser(description="Score SWE-bench patches from lm-eval samples")
+    parser.add_argument("--samples-dir", required=True, help="dir containing lm-eval samples_*.jsonl")
+    parser.add_argument("--out-dir", required=True, help="dir to write predictions + results JSON")
+    parser.add_argument("--model-name", required=True, help="served model name (model_name_or_path)")
+    parser.add_argument("--dataset-name", default=DEFAULT_DATASET)
+    parser.add_argument("--task-name", default=DEFAULT_TASK)
+    parser.add_argument("--run-id", default=None, help="harness run id (default: task name)")
+    parser.add_argument("--max-workers", type=int, default=4)
+    parser.add_argument(
+        "--namespace", default=None,
+        help="swebench --namespace value (pass '' on arm/Mac to build images locally)",
+    )
+    parser.add_argument("--lm-eval-version", default="unknown")
+    parser.add_argument(
+        "--predictions-only", action="store_true",
+        help="write predictions.jsonl and stop (no scoring; score elsewhere)",
+    )
+    parser.add_argument(
+        "--no-run", action="store_true",
+        help="skip the Docker harness; requires --report (offline/testing)",
+    )
+    parser.add_argument(
+        "--report", default=None,
+        help="path to a pre-computed harness report JSON (implies --no-run)",
+    )
+    args = parser.parse_args(argv)
+
+    samples_dir = Path(args.samples_dir)
+    out_dir = Path(args.out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    run_id = args.run_id or args.task_name
+
+    # 1-3. samples -> predictions.jsonl
+    predictions = build_predictions(samples_dir, args.model_name)
+    predictions_path = out_dir / "predictions.jsonl"
+    write_predictions(predictions, predictions_path)
+    print(f"[swebench] wrote {len(predictions)} predictions -> {predictions_path}")
+
+    if args.predictions_only:
+        print("[swebench] predictions-only: skipping scoring (score elsewhere)")
+        return 0
+
+    # 4. score (Docker) or load an existing report
+    if args.report:
+        report = json.loads(Path(args.report).read_text())
+    elif args.no_run:
+        print("ERROR: --no-run requires --report", file=sys.stderr)
+        return 1
+    else:
+        run_harness(
+            predictions_path, args.dataset_name, run_id,
+            out_dir, args.max_workers, args.namespace,
+        )
+        report = json.loads(find_report(out_dir, args.model_name, run_id).read_text())
+
+    resolved, total = parse_resolved(report)
+
+    # 5. emit lm-eval-shaped results
+    results = build_results_json(
+        args.task_name, resolved, total, args.model_name,
+        args.lm_eval_version, report,
+    )
+    results_path = out_dir / f"results_{args.task_name}.json"
+    results_path.write_text(json.dumps(results, indent=2))
+    print(
+        f"[swebench] {args.task_name}: resolved {resolved}/{total} "
+        f"= {resolved / total:.4f} -> {results_path}"
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/utils/evals/test_swebench_eval.py b/utils/evals/test_swebench_eval.py
new file mode 100644
index 000000000..8e59975ca
--- /dev/null
+++ b/utils/evals/test_swebench_eval.py
@@ -0,0 +1,182 @@
+"""Tests for the SWE-bench Lite eval MVP (generation -> scoring -> lm-eval shape).
+
+Pure-stdlib paths (extract_patch, predictions, report parsing, results shape)
+run on any interpreter. The collect/validate integration test guards on
+optional deps / interpreter version so the file imports cleanly even on the
+macOS system python 3.9 used for local spot-checks.
+"""
+
+import json
+import sys
+from pathlib import Path
+
+import pytest
+
+sys.path.insert(0, str(Path(__file__).resolve().parent))          # utils/evals
+sys.path.insert(0, str(Path(__file__).resolve().parents[1]))      # utils
+
+import swebench_score as sbs
+
+
+# --- diff extraction -------------------------------------------------------
+
+def test_extract_patch_from_diff_fence():
+    text = (
+        "Here is the fix:\n\n```diff\n"
+        "diff --git a/f.py b/f.py\n--- a/f.py\n+++ b/f.py\n"
+        "@@ -1 +1 @@\n-old\n+new\n```\nDone."
+    )
+    patch = sbs.extract_patch(text)
+    assert patch.startswith("diff --git a/f.py b/f.py")
+    assert patch.endswith("\n")
+    assert "Here is the fix" not in patch
+    assert "Done." not in patch
+
+
+def test_extract_patch_bare_diff_git():
+    text = "no fence\ndiff --git a/x b/x\n@@ @@\n-a\n+b\n"
+    patch = sbs.extract_patch(text)
+    assert patch.startswith("diff --git a/x b/x")
+    assert "no fence" not in patch
+
+
+def test_extract_patch_empty_when_no_diff():
+    assert sbs.extract_patch("") == ""
+    # Prose with no diff markers falls back to the raw text (harness will reject).
+    assert sbs.extract_patch("just words").strip() == "just words"
+
+
+# --- samples -> predictions ------------------------------------------------
+
+def _write_samples(dirpath: Path, records: list[dict]) -> None:
+    with (dirpath / "samples_swebench_lite_2026.jsonl").open("w") as fh:
+        for rec in records:
+            fh.write(json.dumps(rec) + "\n")
+
+
+def test_build_predictions_extracts_instance_and_patch(tmp_path):
+    _write_samples(tmp_path, [
+        {
+            "doc": {"instance_id": "repo__proj-1"},
+            "filtered_resps": ["```diff\ndiff --git a/a b/a\n+x\n```"],
+        },
+        {
+            "doc": {"instance_id": "repo__proj-2"},
+            "resps": [["diff --git a/b b/b\n+y\n"]],
+        },
+    ])
+    preds = sbs.build_predictions(tmp_path, "my-model")
+    by_id = {p["instance_id"]: p for p in preds}
+    assert set(by_id) == {"repo__proj-1", "repo__proj-2"}
+    assert by_id["repo__proj-1"]["model_name_or_path"] == "my-model"
+    assert by_id["repo__proj-1"]["model_patch"].startswith("diff --git a/a b/a")
+    assert by_id["repo__proj-2"]["model_patch"].startswith("diff --git a/b b/b")
+
+
+def test_build_predictions_raises_without_samples(tmp_path):
+    with pytest.raises(FileNotFoundError):
+        sbs.build_predictions(tmp_path, "m")
+
+
+# --- report parsing --------------------------------------------------------
+
+def test_parse_resolved_classic_counts():
+    assert sbs.parse_resolved(
+        {"resolved_instances": 80, "total_instances": 196}
+    ) == (80, 196)
+
+
+def test_parse_resolved_from_id_lists():
+    report = {"resolved_ids": ["a", "b", "c"], "completed_ids": ["a", "b", "c", "d"]}
+    # no total_instances -> falls back to completed_ids length
+    assert sbs.parse_resolved(report) == (3, 4)
+
+
+def test_parse_resolved_raises_on_garbage():
+    with pytest.raises(ValueError):
+        sbs.parse_resolved({"nope": 1})
+
+
+# --- lm-eval-shaped results ------------------------------------------------
+
+def test_build_results_json_is_lm_eval_shaped():
+    res = sbs.build_results_json(
+        "swebench_lite", 49, 196, "m", "0.4.12", {"resolved_instances": 49}
+    )
+    assert "lm_eval_version" in res  # detection key for collect_eval_results
+    task = res["results"]["swebench_lite"]
+    assert task["exact_match,resolved"] == pytest.approx(0.25)
+    cfg = res["configs"]["swebench_lite"]
+    assert cfg["filter_list"] == [{"name": "resolved"}]
+    assert res["n-samples"]["swebench_lite"]["effective"] == 196
+
+
+def test_score_offline_end_to_end(tmp_path):
+    """--report path: samples -> predictions + results JSON, no Docker."""
+    samples = tmp_path / "gen"
+    samples.mkdir()
+    _write_samples(samples, [
+        {"doc": {"instance_id": "r__p-1"}, "filtered_resps": ["```diff\ndiff --git a/a b/a\n+x\n```"]},
+    ])
+    report = tmp_path / "report.json"
+    report.write_text(json.dumps({"resolved_instances": 1, "total_instances": 1}))
+    out = tmp_path / "out"
+    rc = sbs.main([
+        "--samples-dir", str(samples), "--out-dir", str(out),
+        "--model-name", "m", "--report", str(report),
+    ])
+    assert rc == 0
+    assert (out / "predictions.jsonl").exists()
+    results = json.loads((out / "results_swebench_lite.json").read_text())
+    assert results["results"]["swebench_lite"]["exact_match,resolved"] == 1.0
+
+
+def test_predictions_only_writes_predictions_no_results(tmp_path):
+    """SWEBENCH_SKIP_SCORE path: predictions only, no Docker, no results JSON."""
+    samples = tmp_path / "gen"
+    samples.mkdir()
+    _write_samples(samples, [
+        {"doc": {"instance_id": "r__p-1"}, "filtered_resps": ["```diff\ndiff --git a/a b/a\n+x\n```"]},
+    ])
+    out = tmp_path / "out"
+    rc = sbs.main([
+        "--samples-dir", str(samples), "--out-dir", str(out),
+        "--model-name", "m", "--predictions-only",
+    ])
+    assert rc == 0
+    assert (out / "predictions.jsonl").exists()
+    assert not (out / "results_swebench_lite.json").exists()
+
+
+# --- integration with the existing pipeline (needs tabulate + py3.10+) -----
+
+@pytest.mark.skipif(sys.version_info < (3, 10), reason="repo modules use py3.10 syntax")
+def test_results_json_flows_through_collect_and_validate(tmp_path, monkeypatch):
+    pytest.importorskip("tabulate")
+    import collect_eval_results as cer
+    import validate_scores as vs
+
+    art = tmp_path / "eval"
+    art.mkdir()
+    (art / "meta_env.json").write_text(json.dumps({
+        "infmax_model_prefix": "dsr1", "hw": "b200", "framework": "sglang",
+        "precision": "fp8", "isl": 8192, "osl": 1024,
+    }))
+    res = sbs.build_results_json(
+        "swebench_lite", 150, 300, "dsr1", "0.4.12", None
+    )
+    (art / "results_swebench_lite.json").write_text(json.dumps(res))
+
+    # collect surfaces the resolved-rate as the unified `score`.
+    rows = cer.collect_eval_rows(tmp_path)
+    assert len(rows) == 1
+    assert rows[0]["task"] == "swebench_lite"
+    assert rows[0]["score"] == pytest.approx(0.5)
+
+    # validate_scores gates exact_match,resolved against thresholds.json (0.10).
+    monkeypatch.chdir(art)
+    monkeypatch.setattr(sys, "argv", [
+        "validate_scores.py",
+        "--results-glob", "results_swebench_lite.json",
+    ])
+    assert vs.main() == 0  # 0.5 >= 0.10 default threshold
diff --git a/utils/evals/thresholds.json b/utils/evals/thresholds.json
index d6c091152..cbbe65105 100644
--- a/utils/evals/thresholds.json
+++ b/utils/evals/thresholds.json
@@ -1,7 +1,8 @@
 {
   "default": {
     "gsm8k": 0.90,
-    "gpqa_diamond_cot_n_shot": 0.30
+    "gpqa_diamond_cot_n_shot": 0.30,
+    "swebench_lite": 0.10
   },
   "models": {
     "dsr1": {

From 4886287d3daec61a1f4da3bd9dcdbdb5f014a23d Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Fri, 26 Jun 2026 19:02:58 -0500
Subject: [PATCH 2/4] fix(evals): remove unused Any import in swebench_score.py

Addresses CodeQL (github-code-quality) finding on the PR.
---
 utils/evals/swebench_score.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/evals/swebench_score.py b/utils/evals/swebench_score.py
index edf8ef212..371260443 100644
--- a/utils/evals/swebench_score.py
+++ b/utils/evals/swebench_score.py
@@ -26,7 +26,7 @@
 import subprocess
 import sys
 from pathlib import Path
-from typing import Any, Iterator, Optional
+from typing import Iterator, Optional
 
 DEFAULT_DATASET = "princeton-nlp/SWE-bench_Lite"
 DEFAULT_TASK = "swebench_lite"

From 6d5972abe8ac693a4a7fa845c6035301958b1bd7 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Fri, 26 Jun 2026 19:15:11 -0500
Subject: [PATCH 3/4] fix(evals): address PR review (claude bot) findings on
 swebench

- swebench_score.py: bound the bare-diff fallback to the diff body so trailing
  prose after a patch can't be glued on (would fail git apply -> unresolved,
  suppressing resolved-rate). Add _trim_to_diff_body + regression tests.
- benchmark_lib.sh: derive the scoring dataset from the generation YAML's
  dataset_path so generation/scoring can't diverge; SWEBENCH_DATASET (if set)
  must match or it fails fast. Update docstring + EVALS.md.

CodeQL unused-import (Any) already fixed in 4886287d.
---
 benchmarks/benchmark_lib.sh       | 29 +++++++++++++---
 utils/evals/EVALS.md              |  6 ++--
 utils/evals/swebench_score.py     | 57 ++++++++++++++++++++++++++++---
 utils/evals/test_swebench_eval.py | 27 +++++++++++++++
 4 files changed, 108 insertions(+), 11 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 70353e65d..fb9d35f64 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -977,8 +977,10 @@ META
 # pipeline (append_lm_eval_summary / collect / validate) is unchanged.
 #
 # Env knobs:
-#   SWEBENCH_DATASET       (default princeton-nlp/SWE-bench_Lite)
-#   SWEBENCH_TASK_NAME     (default swebench_lite)
+#   SWEBENCH_TASK_NAME     (default swebench_lite) selects utils/evals/<name>.yaml
+#   SWEBENCH_DATASET       optional; must equal the YAML's dataset_path (the
+#                          scoring dataset is derived from the YAML so generation
+#                          and scoring never diverge) -- mismatch fails fast
 #   SWEBENCH_MAX_WORKERS   (default 4) harness Docker workers
 #   SWEBENCH_NAMESPACE     pass "" on arm/Mac to build images locally
 #   SWEBENCH_SKIP_SCORE    "true" => generate + stage predictions only, no Docker
@@ -989,10 +991,29 @@ run_swebench_eval() {
     local gen_dir
     gen_dir=$(mktemp -d /tmp/swebench_gen-XXXXXX)
 
+    # Keep the scoring dataset in lockstep with the generation YAML: the harness
+    # must score against the same instance set lm-eval generated patches for, or
+    # the instance IDs won't match. Derive it from the task YAML; if
+    # SWEBENCH_DATASET is set it must agree (fail-fast rather than mis-score).
+    local yaml_path="${EVAL_TASKS_DIR:-utils/evals/${task_name}.yaml}"
+    local dataset
+    dataset=$(awk '/^dataset_path:[[:space:]]/{print $2; exit}' "$yaml_path" 2>/dev/null)
+    if [ -z "$dataset" ]; then
+        echo "ERROR: could not read dataset_path from ${yaml_path}" >&2
+        rm -rf "$gen_dir" 2>/dev/null || true
+        return 1
+    fi
+    if [ -n "${SWEBENCH_DATASET:-}" ] && [ "${SWEBENCH_DATASET}" != "$dataset" ]; then
+        echo "ERROR: SWEBENCH_DATASET='${SWEBENCH_DATASET}' disagrees with ${yaml_path} dataset_path='${dataset}'." >&2
+        echo "       Generation and scoring must use the same dataset; edit the YAML or unset SWEBENCH_DATASET." >&2
+        rm -rf "$gen_dir" 2>/dev/null || true
+        return 1
+    fi
+
     # 1. Generation via lm-eval (reuses endpoint wiring, _patch_lm_eval, etc.).
     #    run_lm_eval already passes --log_samples, which is what we consume.
     local prev_tasks_dir="${EVAL_TASKS_DIR:-}"
-    export EVAL_TASKS_DIR="${EVAL_TASKS_DIR:-utils/evals/${task_name}.yaml}"
+    export EVAL_TASKS_DIR="$yaml_path"
     local gen_rc=0
     run_lm_eval "$@" --results-dir "$gen_dir" || gen_rc=$?
     export EVAL_TASKS_DIR="$prev_tasks_dir"
@@ -1031,7 +1052,7 @@ run_swebench_eval() {
         --out-dir "$out_dir" \
         --model-name "${MODEL_NAME:-$MODEL}" \
         --task-name "$task_name" \
-        --dataset-name "${SWEBENCH_DATASET:-princeton-nlp/SWE-bench_Lite}" \
+        --dataset-name "$dataset" \
         --max-workers "${SWEBENCH_MAX_WORKERS:-4}" \
         --lm-eval-version "$lm_eval_version" \
         ${SWEBENCH_NAMESPACE+--namespace "$SWEBENCH_NAMESPACE"} \
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index 752e13131..a7738defc 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -187,8 +187,10 @@ append_lm_eval_summary
 - Scoring: `utils/evals/swebench_score.py` (diff extraction → `predictions.jsonl` →
   `python -m swebench.harness.run_evaluation` → resolved-rate → results JSON). Offline
   `--report` mode skips Docker for testing.
-- Knobs: `SWEBENCH_DATASET`, `SWEBENCH_TASK_NAME`, `SWEBENCH_MAX_WORKERS`,
-  `SWEBENCH_NAMESPACE` (pass `""` on arm/Mac), `SWEBENCH_SKIP_SCORE=true` (generate-only).
+- Knobs: `SWEBENCH_TASK_NAME` (selects the YAML), `SWEBENCH_MAX_WORKERS`,
+  `SWEBENCH_NAMESPACE` (pass `""` on arm/Mac), `SWEBENCH_SKIP_SCORE=true` (generate-only). The
+  scoring dataset is derived from the YAML's `dataset_path` so generation and scoring can't diverge;
+  `SWEBENCH_DATASET`, if set, must match it (mismatch fails fast).
 - **Requires Docker + ~120 GB disk on the scoring host.** This is an MVP; the single-shot prompt and
   diff extraction still need tuning to reach published resolved-rates, and the `thresholds.json` entry
   needs calibration from a baseline run.
diff --git a/utils/evals/swebench_score.py b/utils/evals/swebench_score.py
index 371260443..c1e511ed0 100644
--- a/utils/evals/swebench_score.py
+++ b/utils/evals/swebench_score.py
@@ -39,6 +39,44 @@
 )
 _DIFF_GIT_RE = re.compile(r"(?:^|\n)(diff --git .*)", re.DOTALL)
 
+# Line prefixes that belong to a (git) unified-diff body. Anything else marks
+# the end of the patch.
+_DIFF_LINE_PREFIXES = (
+    "diff ", "index ", "--- ", "+++ ", "@@", "+", "-", " ", "\\",
+    "old mode ", "new mode ", "new file mode ", "deleted file mode ",
+    "rename ", "copy ", "similarity ", "dissimilarity ",
+    "Binary files ", "GIT binary patch",
+)
+
+
+def _trim_to_diff_body(text: str) -> str:
+    """Return the leading run of diff-shaped lines, dropping trailing prose.
+
+    Models often follow a bare patch with an explanation ("Notes:", "This
+    fixes #123."); glued onto the patch it makes ``git apply`` fail and the
+    instance score unresolved. Blank lines survive only when a diff line
+    follows them. Lines are split on CR/LF only: ``str.splitlines`` would also
+    break on form feeds etc. *inside* a content line and truncate the patch.
+    """
+    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
+    out: list[str] = []
+    i, n = 0, len(lines)
+    while i < n:
+        if lines[i].startswith(_DIFF_LINE_PREFIXES):
+            out.append(lines[i])
+            i += 1
+            continue
+        if lines[i] == "":
+            j = i
+            while j < n and lines[j] == "":
+                j += 1
+            if j < n and lines[j].startswith(_DIFF_LINE_PREFIXES):
+                out.extend(lines[i:j])  # interior blank line(s); diff resumes
+                i = j
+                continue
+        break  # trailing blank(s)+prose, or any other non-diff line
+    return "\n".join(out)
+
 
 def extract_patch(text: str) -> str:
     """Pull a unified diff out of a model generation.
@@ -50,19 +88,33 @@ def extract_patch(text: str) -> str:
     """
     if not text:
         return ""
+
+    def _finish(body: str) -> str:
+        # Trim to the diff body, then normalise to a single trailing newline.
+        body = _trim_to_diff_body(body).strip("\n")
+        return body + "\n" if body else ""
+
     # 1. Prefer a fenced block that actually looks like a diff.
     for match in _FENCED_DIFF_RE.finditer(text):
         body = match.group("body")
         if "diff --git" in body or body.lstrip().startswith(("--- ", "+++ ")):
-            return body.strip("\n") + "\n"
-    # 2. Fall back to the first ``diff --git`` to end-of-text.
+            # Trimming yields "" when the fence opens with a non-diff line;
+            # keep looking rather than returning an empty prediction.
+            trimmed = _finish(body)
+            if trimmed:
+                return trimmed
+    # 2. Fall back to a bare ``diff --git``, trimmed to the diff body so
+    #    trailing prose can't corrupt the patch.
     git_match = _DIFF_GIT_RE.search(text)
     if git_match:
-        return git_match.group(1).strip("\n") + "\n"
-    # 3. Last resort: a lone fenced block, or the raw text.
+        trimmed = _finish(git_match.group(1))
+        if trimmed:
+            return trimmed
+    # 3. Last resort: a lone fenced block (fence-bounded), or the raw text.
     lone = _FENCED_DIFF_RE.search(text)
     if lone:
-        return lone.group("body").strip("\n") + "\n"
+        body = lone.group("body").strip("\n")
+        return body + "\n" if body else ""
     return text.strip("\n") + "\n" if text.strip() else ""
 
 
diff --git a/utils/evals/test_swebench_eval.py b/utils/evals/test_swebench_eval.py
index 8e59975ca..067742759 100644
--- a/utils/evals/test_swebench_eval.py
+++ b/utils/evals/test_swebench_eval.py
@@ -40,6 +40,33 @@ def test_extract_patch_bare_diff_git():
     assert "no fence" not in patch
 
 
+def test_extract_patch_bare_diff_strips_trailing_prose():
+    # Prose that follows a bare diff has to be cut off: left attached, git
+    # apply rejects the patch and the instance is scored unresolved.
+    diff = "diff --git a/x b/x\n--- a/x\n+++ b/x\n@@ -1 +1 @@\n-old\n+new\n"
+    tail = "\nNotes:\nThis fixes #123.\n"
+    extracted = sbs.extract_patch(diff + tail)
+    # The patch ends at its last hunk line ...
+    assert extracted.rstrip().endswith("+new")
+    # ... and none of the explanation leaks into it.
+    for leaked in ("Notes:", "This fixes"):
+        assert leaked not in extracted
+
+
+def test_extract_patch_keeps_multi_file_and_interior_context():
+    # A fenced two-file diff containing a context line (leading " ") is kept
+    # whole; the chatter after the closing fence is not part of the patch.
+    first = "diff --git a/a b/a\n@@ -1,2 +1,2 @@\n context\n-x\n+y\n"
+    second = "diff --git a/b b/b\n@@ -1 +1 @@\n-p\n+q\n"
+    reply = "```diff\n" + first + second + "```\nthanks!"
+
+    extracted = sbs.extract_patch(reply)
+
+    for header in ("diff --git a/a b/a", "diff --git a/b b/b"):
+        assert header in extracted
+    assert "thanks" not in extracted
+
+
 def test_extract_patch_empty_when_no_diff():
     assert sbs.extract_patch("") == ""
     # Prose with no diff markers falls back to the raw text (harness will reject).

From ce358e4049bc699fc14fc9780e129dd046c45836 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Fri, 26 Jun 2026 20:42:24 -0500
Subject: [PATCH 4/4] feat(evals): /run-evals comment command to run one eval
 on one recipe

Adds a lightweight, eval-only PR-comment trigger (no perf sweep):
  /run-evals <eval> <config-key> [conc] [master-config]
e.g. /run-evals swebench_lite dsr1-fp4-b200-sglang 16

- benchmark_lib.sh: EVAL_FRAMEWORK env now wins over run_eval's --framework arg
  so an eval-only run can override the recipes' hardcoded lm-eval (default
  behavior unchanged when env unset).
- e2e-tests.yml / benchmark-tmpl.yml / benchmark-multinode-tmpl.yml: thread new
  eval-framework + eval-task inputs -> EVAL_FRAMEWORK / EVAL_TASKS_DIR env.
- run-evals.yml: new comment workflow (mirrors pr-comment-sweep.yml auth/SHA-pin/
  reply), maps <eval> -> framework+task, infers nvidia/amd master config from the
  config-key HW token, builds 'test-config ... --evals-only', calls e2e-tests.yml.
- test_run_eval_dispatch.py: unit tests for the env-override dispatch.

NOTE: swebench scoring needs Docker on the eval runner (else SWEBENCH_SKIP_SCORE).
---
 .../workflows/benchmark-multinode-tmpl.yml    |  12 ++
 .github/workflows/benchmark-tmpl.yml          |  12 ++
 .github/workflows/e2e-tests.yml               |  24 +++
 .github/workflows/run-evals.yml               | 192 ++++++++++++++++++
 benchmarks/benchmark_lib.sh                   |   9 +-
 utils/evals/test_run_eval_dispatch.py         |  53 +++++
 6 files changed, 300 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/run-evals.yml
 create mode 100644 utils/evals/test_run_eval_dispatch.py

diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index 3beb246cc..b8600b049 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -91,6 +91,16 @@ on:
         type: string
         required: false
         default: ""
+      eval-framework:
+        description: "Eval framework (lm-eval | swebench). Empty = recipe default."
+        type: string
+        required: false
+        default: ""
+      eval-task:
+        description: "Eval task YAML path. Empty = framework default."
+        type: string
+        required: false
+        default: ""
       scenario-type:
         description: "Scenario type (fixed-seq-len or agentic-coding)"
         type: string
@@ -143,6 +153,8 @@ env:
   RUN_EVAL: ${{ inputs.run-eval }}
   EVAL_ONLY: ${{ inputs.eval-only }}
   EVAL_CONC: ${{ inputs.eval-conc }}
+  EVAL_FRAMEWORK: ${{ inputs.eval-framework }}
+  EVAL_TASKS_DIR: ${{ inputs.eval-task }}
   SCENARIO_TYPE: ${{ inputs.scenario-type }}
   SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || 'fixed_seq_len/' }}
   IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }}
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index a57e89725..15c052207 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -59,6 +59,16 @@ on:
         type: boolean
         required: false
         default: false
+      eval-framework:
+        description: "Eval framework (lm-eval | swebench). Empty = recipe default."
+        type: string
+        required: false
+        default: ""
+      eval-task:
+        description: "Eval task YAML path. Empty = framework default."
+        type: string
+        required: false
+        default: ""
       random-range-ratio:
         required: false
         type: string
@@ -108,6 +118,8 @@ env:
   DISAGG: ${{ inputs.disagg }}
   RUN_EVAL: ${{ inputs.run-eval }}
   EVAL_ONLY: ${{ inputs.eval-only }}
+  EVAL_FRAMEWORK: ${{ inputs.eval-framework }}
+  EVAL_TASKS_DIR: ${{ inputs.eval-task }}
   SCENARIO_TYPE: ${{ inputs.scenario-type }}
   SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || 'fixed_seq_len/' }}
   IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }}
diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml
index 1b83a798a..b5b65e8cf 100644
--- a/.github/workflows/e2e-tests.yml
+++ b/.github/workflows/e2e-tests.yml
@@ -21,6 +21,16 @@ on:
                 required: false
                 type: string
                 default: ""
+            eval-framework:
+                description: "Eval framework (lm-eval | swebench). Empty = recipe default."
+                required: false
+                type: string
+                default: ""  # a non-empty value is forwarded as EVAL_FRAMEWORK and overrides the recipe's --framework
+            eval-task:
+                description: "Eval task YAML path (e.g. utils/evals/gsm8k.yaml). Empty = framework default."
+                required: false
+                type: string
+                default: ""
     workflow_call:
         inputs:
             generate-cli-command:
@@ -40,6 +50,16 @@ on:
                 required: false
                 type: string
                 default: ""
+            eval-framework:
+                description: "Eval framework (lm-eval | swebench). Empty = recipe default."
+                required: false
+                type: string
+                default: ""  # a non-empty value is forwarded as EVAL_FRAMEWORK and overrides the recipe's --framework
+            eval-task:
+                description: "Eval task YAML path (e.g. utils/evals/gsm8k.yaml). Empty = framework default."
+                required: false
+                type: string
+                default: ""
 
 jobs:
     get-jobs:
@@ -160,6 +180,8 @@ jobs:
             run-eval: true
             eval-only: true
             eval-conc: ${{ matrix.config['eval-all-concs'] && join(matrix.config.conc, ' ') || matrix.config['eval-conc'] }}
+            eval-framework: ${{ inputs.eval-framework }}
+            eval-task: ${{ inputs.eval-task }}
             ref: ${{ inputs.ref }}
 
     test-sweep-agentic:
@@ -294,6 +316,8 @@ jobs:
             disagg: ${{ matrix.config.disagg }}
             run-eval: true
             eval-only: true
+            eval-framework: ${{ inputs.eval-framework }}
+            eval-task: ${{ inputs.eval-task }}
             ref: ${{ inputs.ref }}
 
     collect-results:
diff --git a/.github/workflows/run-evals.yml b/.github/workflows/run-evals.yml
new file mode 100644
index 000000000..1d6b5d06e
--- /dev/null
+++ b/.github/workflows/run-evals.yml
@@ -0,0 +1,192 @@
+name: Slash Command Run Evals
+run-name: "/run-evals PR #${{ github.event.issue.number }}"
+
+# Comment-triggered, eval-only run of ONE eval on ONE recipe (no perf sweep).
+# Usage in a PR comment:
+#   /run-evals <eval> <config-key> [conc] [master-config]
+# where <eval> is one of: gsm8k | gpqa | swebench_lite (alias: swebench).
+# Example: /run-evals swebench_lite dsr1-fp4-b200-sglang 16
+# Mirrors pr-comment-sweep.yml; differs only in parsing + the eval mapping it
+# forwards to e2e-tests.yml (eval-framework / eval-task).
+
+on:
+  issue_comment:
+    types: [created]
+
+permissions:
+  contents: read
+  issues: write
+  pull-requests: write
+
+jobs:
+  get-jobs:
+    # Only run for PR comments that start with /run-evals
+    if: ${{ github.event.issue.pull_request && startsWith(github.event.comment.body, '/run-evals') }}
+    runs-on: ubuntu-latest
+    outputs:
+      pr-number: ${{ steps.parse.outputs.pr-number }}
+      generator-args: ${{ steps.parse.outputs.generator-args }}
+      eval-framework: ${{ steps.parse.outputs.eval-framework }}
+      eval-task: ${{ steps.parse.outputs.eval-task }}
+      author-can-bypass: ${{ steps.auth.outputs.can-bypass }}
+      # Immutable ref (commit SHA) to prevent TOCTOU on refs/pull/<n>/head
+      ref: ${{ steps.ref_comment.outputs.ref }}
+    steps:
+      - name: Parse PR comment (/run-evals <eval> <config-key> [conc] [master])
+        id: parse
+        if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request && startsWith(github.event.comment.body, '/run-evals') }}
+        shell: bash
+        env:
+          BODY: ${{ github.event.comment.body }}
+          PR_NUMBER: ${{ github.event.issue.number }}
+        run: |
+          set -euo pipefail
+          # First line starting with /run-evals; CRs are stripped since comment bodies may use CRLF.
+          cmd_line=$(printf "%s" "$BODY" | tr -d '\r' | awk '/^\/run-evals/{print; exit}')
+          if [[ -z "$cmd_line" ]]; then
+            echo "No /run-evals command found at comment start" >&2
+            exit 1
+          fi
+          # Positional args after the command (whitespace-separated tokens).
+          read -ra parts <<< "${cmd_line#/run-evals}"
+          eval_name="${parts[0]:-}"
+          config_key="${parts[1]:-}"
+          conc="${parts[2]:-}"
+          master_override="${parts[3]:-}"
+          if [[ -z "$eval_name" || -z "$config_key" ]]; then
+            echo "usage: /run-evals <eval> <config-key> [conc] [master-config]" >&2
+            echo "valid evals: gsm8k | gpqa | swebench_lite" >&2
+            exit 1
+          fi
+
+          # Map <eval> -> (framework, task YAML).
+          case "$eval_name" in
+            gsm8k)                  framework="lm-eval";  task="utils/evals/gsm8k.yaml" ;;
+            gpqa|gpqa_diamond)      framework="lm-eval";  task="utils/evals/gpqa_diamond.yaml" ;;
+            swebench|swebench_lite) framework="swebench"; task="utils/evals/swebench_lite.yaml" ;;
+            *)
+              echo "unknown eval '$eval_name' (valid: gsm8k, gpqa, swebench_lite)" >&2
+              exit 1
+              ;;
+          esac
+
+          if [[ -n "$conc" && ! "$conc" =~ ^[1-9][0-9]*$ ]]; then
+            echo "conc must be a positive integer, got '$conc'" >&2
+            exit 1
+          fi
+
+          # Pick the platform master config from the config-key's hardware token,
+          # unless an explicit 4th arg overrides it.
+          if [[ -n "$master_override" ]]; then
+            master="$master_override"
+          elif [[ "$config_key" =~ (b200|b300|h100|h200|gb200|gb300) ]]; then
+            master=".github/configs/nvidia-master.yaml"
+          elif [[ "$config_key" =~ (mi300x|mi325x|mi355x) ]]; then
+            master=".github/configs/amd-master.yaml"
+          else
+            echo "cannot infer platform from config-key '$config_key'; pass the master config path as a 4th arg" >&2
+            exit 1
+          fi
+
+          gen="test-config --config-files ${master} --config-keys ${config_key} --evals-only"
+          if [[ -n "$conc" ]]; then
+            gen="${gen} --conc ${conc}"
+          fi
+
+          {
+            echo "generator-args=${gen}"
+            echo "eval-framework=${framework}"
+            echo "eval-task=${task}"
+            echo "pr-number=${PR_NUMBER}"
+          } >> "$GITHUB_OUTPUT"
+
+      - name: Check author permissions
+        id: auth
+        if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request }}
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        with:
+          script: |
+            const owner = context.repo.owner;
+            const repo = context.repo.repo;
+            const username = context.payload.comment?.user?.login;
+            let permission = 'none';
+            try {
+              const res = await github.rest.repos.getCollaboratorPermissionLevel({ owner, repo, username });
+              permission = res.data?.permission || 'none';
+            } catch (e) {
+              permission = 'none';
+            }
+            const canBypass = ['admin','maintain','write'].includes(permission);
+            core.info(`Author ${username} permission: ${permission}; bypass=${canBypass}`);
+            core.setOutput('can-bypass', canBypass ? 'true' : 'false');
+
+      # ---- PR SHA pinning ----
+      - name: Resolve immutable PR ref (pin to head SHA)
+        id: ref_comment
+        if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request && startsWith(github.event.comment.body, '/run-evals') }}
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        with:
+          script: |
+            const owner = context.repo.owner;
+            const repo = context.repo.repo;
+            const pr = context.issue.number;
+            const res = await github.rest.pulls.get({ owner, repo, pull_number: pr });
+            const sha = res.data.head.sha;
+            core.info(`Resolved PR #${pr} head SHA: ${sha}`);
+            core.setOutput('ref', sha);
+
+      - name: Reply with run link
+        if: ${{ github.event_name == 'issue_comment' && startsWith(github.event.comment.body, '/run-evals') && github.repository_owner == 'SemiAnalysisAI' }}
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        continue-on-error: true
+        env:
+          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          AUTHOR: ${{ github.event.comment.user.login }}
+          GEN_CMD: ${{ steps.parse.outputs.generator-args }}
+          EVAL_FRAMEWORK: ${{ steps.parse.outputs.eval-framework }}
+          CAN_BYPASS: ${{ steps.auth.outputs.can-bypass }}
+          PINNED_REF: ${{ steps.ref_comment.outputs.ref }}
+        with:
+          github-token: ${{ github.token }}
+          script: |
+            const owner = context.repo.owner;
+            const repo = context.repo.repo;
+            const issue_number = context.issue.number;
+            const runUrl = process.env.RUN_URL;
+            const author = process.env.AUTHOR;
+            const genCmd = process.env.GEN_CMD || '';
+            const framework = process.env.EVAL_FRAMEWORK || '';
+            const canBypass = (process.env.CAN_BYPASS || '').toLowerCase() === 'true';
+            const pinned = process.env.PINNED_REF || '';
+            const shortSha = pinned ? pinned.slice(0, 7) : '';
+            const approvalMsg = canBypass ? 'Approval: not required (trusted collaborator).' : "Approval: required in environment 'Outside Collaborator E2E Test'.";
+            const body = `@${author} Kicking off an eval-only run (framework: \`${framework}\`).\n\nRun: ${runUrl}\nCommand: \`${genCmd}\`\nPinned ref: \`${shortSha}\`\n${approvalMsg}`;
+            await github.rest.issues.createComment({ owner, repo, issue_number, body });
+
+  approval:
+    needs: get-jobs
+    if: ${{ github.event_name == 'issue_comment' && needs.get-jobs.outputs.pr-number != '' && needs.get-jobs.outputs.generator-args != '' && needs.get-jobs.outputs.author-can-bypass != 'true' }}
+    runs-on: ubuntu-latest
+    name: approval
+    environment: Outside Collaborator E2E Test
+    steps:
+      - run: echo "approved"
+
+  validate:
+    needs: [get-jobs, approval]
+    # always() is required to evaluate this condition when 'approval' is skipped (trusted author)
+    if: ${{ always() && needs.get-jobs.result == 'success' && needs.get-jobs.outputs.pr-number != '' && needs.get-jobs.outputs.generator-args != '' && (needs.get-jobs.outputs.author-can-bypass == 'true' || needs.approval.result == 'success') }}
+    # Concurrency at job level so non-/run-evals comments don't cancel active runs
+    concurrency:
+      group: "run-evals-PR#${{ needs.get-jobs.outputs.pr-number }}"
+      cancel-in-progress: true
+    uses: ./.github/workflows/e2e-tests.yml
+    name: validate
+    secrets: inherit
+    with:
+      generate-cli-command: ${{ needs.get-jobs.outputs.generator-args }}
+      eval-framework: ${{ needs.get-jobs.outputs.eval-framework }}
+      eval-task: ${{ needs.get-jobs.outputs.eval-task }}
+      test-name: PR #${{ needs.get-jobs.outputs.pr-number }} /run-evals
+      # Use pinned SHA to prevent TOCTOU on refs/pull/<n>/head
+      ref: ${{ needs.get-jobs.outputs.ref }}
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index fb9d35f64..060b0e926 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -1069,15 +1069,20 @@ run_swebench_eval() {
 # ------------------------------
 
 run_eval() {
-    local framework="${EVAL_FRAMEWORK:-lm-eval}"
+    # EVAL_FRAMEWORK (env) wins over the --framework arg so an eval-only run
+    # (e.g. the /run-evals command) can override the recipe scripts' hardcoded
+    # `--framework lm-eval`. With the env unset, the CLI arg (else lm-eval) is
+    # used exactly as before.
+    local cli_framework=""
     local forwarded=()
 
     while [[ $# -gt 0 ]]; do
         case "$1" in
-            --framework) framework="$2"; shift 2 ;;
+            --framework) cli_framework="$2"; shift 2 ;;
             *)           forwarded+=("$1"); shift ;;
         esac
     done
+    local framework="${EVAL_FRAMEWORK:-${cli_framework:-lm-eval}}"
 
     # Compute EVAL_MAX_MODEL_LEN if not already set by the calling script
     if [ -z "${EVAL_MAX_MODEL_LEN:-}" ]; then
diff --git a/utils/evals/test_run_eval_dispatch.py b/utils/evals/test_run_eval_dispatch.py
new file mode 100644
index 000000000..44a5d785a
--- /dev/null
+++ b/utils/evals/test_run_eval_dispatch.py
@@ -0,0 +1,53 @@
+"""run_eval framework dispatch: EVAL_FRAMEWORK (env) overrides the --framework arg.
+
+This is what lets `/run-evals swebench_lite ...` run swebench even though every
+recipe script hardcodes `run_eval --framework lm-eval`. With the env unset, the
+CLI arg (else lm-eval) is used as before.
+"""
+
+import os
+import subprocess
+from pathlib import Path
+
+BENCHMARK_LIB = Path(__file__).resolve().parents[2] / "benchmarks" / "benchmark_lib.sh"
+
+# Stub the framework runners so dispatch is observable without a server/Docker,
+# and pin EVAL_MAX_MODEL_LEN so run_eval skips context computation.
+_SCRIPT = r'''
+source "$BENCHMARK_LIB"
+run_lm_eval()       { echo "DISPATCH=lm-eval"; }
+run_swebench_eval() { echo "DISPATCH=swebench"; }
+export EVAL_MAX_MODEL_LEN=16384
+unset EVAL_CONCURRENT_REQUESTS
+run_eval --framework "$CLI_FW" --port 8888
+'''
+
+
+def _dispatch(cli_fw: str, env_fw: str | None) -> str:
+    # Run _SCRIPT under bash with a controlled EVAL_FRAMEWORK; return stdout.
+    child_env = {k: v for k, v in os.environ.items() if k != "EVAL_FRAMEWORK"}
+    child_env.update(BENCHMARK_LIB=str(BENCHMARK_LIB), CLI_FW=cli_fw)
+    if env_fw is not None:
+        child_env["EVAL_FRAMEWORK"] = env_fw
+    cmd = ["bash", "-c", _SCRIPT]
+    done = subprocess.run(cmd, env=child_env, text=True, capture_output=True, check=True)
+    return done.stdout
+
+
+def test_env_framework_overrides_cli_arg():
+    # EVAL_FRAMEWORK=swebench beats the recipe's `--framework lm-eval`.
+    assert "DISPATCH=swebench" in _dispatch(cli_fw="lm-eval", env_fw="swebench")
+
+
+def test_cli_arg_used_when_env_unset():
+    assert "DISPATCH=lm-eval" in _dispatch(cli_fw="lm-eval", env_fw=None)  # no env override
+
+
+def test_swebench_via_cli_arg_when_env_unset():
+    assert "DISPATCH=swebench" in _dispatch(cli_fw="swebench", env_fw=None)  # CLI arg alone
+
+
+def test_empty_env_falls_back_to_cli_arg():
+    # The workflow template exports EVAL_FRAMEWORK="" when no override is
+    # requested; an empty value must behave like unset, so the CLI arg wins.
+    assert "DISPATCH=lm-eval" in _dispatch(cli_fw="lm-eval", env_fw="")