SemiAnalysisAI · adibarra · Jun 26, 2026 · Jun 27, 2026 · Jun 27, 2026 · Jun 27, 2026
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -91,6 +91,16 @@ on:
         type: string
         required: false
         default: ""
+      # Optional eval selection. Surfaced to the workflow as the EVAL_FRAMEWORK
+      # env var; an empty string leaves the choice to the recipe.
+      eval-framework:
+        description: "Eval framework (lm-eval | swebench). Empty = recipe default."
+        type: string
+        required: false
+        default: ""
+      # Surfaced to the workflow as the EVAL_TASKS_DIR env var; an empty string
+      # leaves the choice to the selected framework.
+      eval-task:
+        description: "Eval task YAML path. Empty = framework default."
+        type: string
+        required: false
+        default: ""
       scenario-type:
         description: "Scenario type (fixed-seq-len or agentic-coding)"
         type: string
@@ -143,6 +153,8 @@ env:
   RUN_EVAL: ${{ inputs.run-eval }}
   EVAL_ONLY: ${{ inputs.eval-only }}
   EVAL_CONC: ${{ inputs.eval-conc }}
+  # Eval selection forwarded verbatim from the eval-framework / eval-task inputs.
+  # NOTE(review): eval-task is documented as a task YAML *path* but is exported
+  # as EVAL_TASKS_DIR — confirm the consumer expects a file path, not a directory.
+  EVAL_FRAMEWORK: ${{ inputs.eval-framework }}
+  EVAL_TASKS_DIR: ${{ inputs.eval-task }}
   SCENARIO_TYPE: ${{ inputs.scenario-type }}
   SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || 'fixed_seq_len/' }}
   IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }}

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
@@ -59,6 +59,16 @@ on:
         type: boolean
         required: false
         default: false
+      # Optional eval selection. Surfaced to the workflow as the EVAL_FRAMEWORK
+      # env var; an empty string leaves the choice to the recipe.
+      eval-framework:
+        description: "Eval framework (lm-eval | swebench). Empty = recipe default."
+        type: string
+        required: false
+        default: ""
+      # Surfaced to the workflow as the EVAL_TASKS_DIR env var; an empty string
+      # leaves the choice to the selected framework.
+      eval-task:
+        description: "Eval task YAML path. Empty = framework default."
+        type: string
+        required: false
+        default: ""
       random-range-ratio:
         required: false
         type: string
@@ -108,6 +118,8 @@ env:
   DISAGG: ${{ inputs.disagg }}
   RUN_EVAL: ${{ inputs.run-eval }}
   EVAL_ONLY: ${{ inputs.eval-only }}
+  # Eval selection forwarded verbatim from the eval-framework / eval-task inputs.
+  # NOTE(review): eval-task is documented as a task YAML *path* but is exported
+  # as EVAL_TASKS_DIR — confirm the consumer expects a file path, not a directory.
+  EVAL_FRAMEWORK: ${{ inputs.eval-framework }}
+  EVAL_TASKS_DIR: ${{ inputs.eval-task }}
   SCENARIO_TYPE: ${{ inputs.scenario-type }}
   SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || 'fixed_seq_len/' }}
   IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }}

diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml
@@ -21,6 +21,16 @@
                 required: false
                 type: string
                 default: ""
+            eval-framework:
+                description: "Eval framework (lm-eval | swebench). Empty = recipe default."
+                required: false
+                type: string
+                # Empty by default: a run that does not pick a framework must fall
+                # back to the recipe default (the benchmark templates treat "" that
+                # way) instead of being silently forced onto lm-eval.
+                default: ""
+            eval-task:
+                description: "Eval task YAML path (e.g. utils/evals/gsm8k.yaml). Empty = framework default."
+                required: false
+                type: string
+                default: ""
     workflow_call:
         inputs:
             generate-cli-command:
@@ -40,29 +50,39 @@
                 required: false
                 type: string
                 default: ""
+            eval-framework:
+                description: "Eval framework (lm-eval | swebench). Empty = recipe default."
+                required: false
+                type: string
+                # Empty by default: callers that do not pass this input must keep
+                # the recipe default (the benchmark templates treat "" that way)
+                # instead of being silently forced onto lm-eval.
+                default: ""
+            eval-task:
+                description: "Eval task YAML path (e.g. utils/evals/gsm8k.yaml). Empty = framework default."
+                required: false
+                type: string
+                default: ""
 
 jobs:
     get-jobs:
        runs-on: ubuntu-latest
        outputs:
            single-node-config: ${{ steps.get-jobs.outputs.single-node-config }}
            multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }}
            eval-config: ${{ steps.get-jobs.outputs.eval-config }}
            multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }}
            agentic-config: ${{ steps.get-jobs.outputs.agentic-config }}
            multi-node-agentic-config: ${{ steps.get-jobs.outputs.multi-node-agentic-config }}
        steps:
            - name: Checkout code (ref)
              if: ${{ inputs.ref && inputs.ref != '' }}
              uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
              with:
                ref: ${{ inputs.ref }}

            - name: Checkout code (default)
              if: ${{ !inputs.ref || inputs.ref == '' }}
              uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
              with:
                ref: ${{ github.sha }}

            - id: get-jobs
              run: |
@@ -160,6 +180,8 @@
             run-eval: true
             eval-only: true
             eval-conc: ${{ matrix.config['eval-all-concs'] && join(matrix.config.conc, ' ') || matrix.config['eval-conc'] }}
+            eval-framework: ${{ inputs.eval-framework }}
+            eval-task: ${{ inputs.eval-task }}
             ref: ${{ inputs.ref }}
 
     test-sweep-agentic:
@@ -294,6 +316,8 @@
             disagg: ${{ matrix.config.disagg }}
             run-eval: true
             eval-only: true
+            eval-framework: ${{ inputs.eval-framework }}
+            eval-task: ${{ inputs.eval-task }}
             ref: ${{ inputs.ref }}
 
     collect-results:

diff --git a/.github/workflows/run-evals.yml b/.github/workflows/run-evals.yml
@@ -0,0 +1,192 @@
+name: Slash Command Run Evals
+run-name: "/run-evals PR #${{ github.event.issue.number }}"
+
+# Runs a single eval against a single recipe when a PR comment asks for it
+# (eval-only; no perf sweep).
+#
+# Comment syntax:
+#   /run-evals <eval> <config-key> [conc] [master-config]
+#
+# <eval> accepts gsm8k, gpqa (alias: gpqa_diamond) and swebench_lite
+# (alias: swebench).
+# Example: /run-evals swebench_lite dsr1-fp4-b200-sglang 16
+#
+# Modelled on pr-comment-sweep.yml; the differences are the comment parsing and
+# the eval-framework / eval-task values passed on to e2e-tests.yml.
+
+on:
+  issue_comment:
+    types:
+      - created
+
+permissions:
+  contents: read
+  issues: write
+  pull-requests: write
+
+jobs:
+  # Parses the /run-evals comment, checks the commenter's repository permission,
+  # pins the PR head SHA and posts a reply linking to this run. Its outputs feed
+  # the 'approval' and 'validate' jobs.
+  get-jobs:
+    # Only run for PR comments that start with /run-evals
+    if: ${{ github.event.issue.pull_request && startsWith(github.event.comment.body, '/run-evals') }}
+    runs-on: ubuntu-latest
+    outputs:
+      # PR (issue) number the comment was posted on.
+      pr-number: ${{ steps.parse.outputs.pr-number }}
+      # Generator CLI arguments built by the parse step ("test-config ... --evals-only").
+      generator-args: ${{ steps.parse.outputs.generator-args }}
+      # Framework / task YAML that the parse step mapped from the <eval> argument.
+      eval-framework: ${{ steps.parse.outputs.eval-framework }}
+      eval-task: ${{ steps.parse.outputs.eval-task }}
+      # 'true' when the commenter has admin/maintain/write permission, else 'false'.
+      author-can-bypass: ${{ steps.auth.outputs.can-bypass }}
+      # Immutable ref (commit SHA) to prevent TOCTOU on refs/pull/<n>/head
+      ref: ${{ steps.ref_comment.outputs.ref }}
+    steps:
+      # Turns "/run-evals <eval> <config-key> [conc] [master-config]" into the
+      # generator CLI arguments plus the eval framework/task handed to e2e-tests.yml.
+      # Outputs: generator-args, eval-framework, eval-task, pr-number.
+      # Fails (exit 1) on a missing/unknown eval, a bad conc, an unsafe config key
+      # or master path, or when the platform cannot be inferred.
+      - name: Parse PR comment (/run-evals <eval> <config-key> [conc] [master])
+        id: parse
+        if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request && startsWith(github.event.comment.body, '/run-evals') }}
+        shell: bash
+        env:
+          # Untrusted comment text is passed through the environment, never
+          # interpolated into the script source.
+          BODY: ${{ github.event.comment.body }}
+          PR_NUMBER: ${{ github.event.issue.number }}
+        run: |
+          set -euo pipefail
+          # Find the first line that is the /run-evals command itself: the command
+          # must be followed by whitespace or end of line, so "/run-evals-foo" is
+          # not accepted. A plain bash loop is used instead of a
+          # `printf | awk '{...; exit}'` pipeline so that an early-exiting reader
+          # cannot fail the step through SIGPIPE under pipefail.
+          cmd_line=""
+          while IFS= read -r line; do
+            # Comment bodies may use CRLF line endings; drop the CR so it does not
+            # stay glued to the last argument.
+            line="${line%$'\r'}"
+            if [[ "$line" =~ ^/run-evals([[:space:]]|$) ]]; then
+              cmd_line="$line"
+              break
+            fi
+          done <<< "$BODY"
+          if [[ -z "$cmd_line" ]]; then
+            echo "No /run-evals command found at comment start" >&2
+            exit 1
+          fi
+          # Positional args after the command, split on whitespace.
+          read -ra parts <<< "${cmd_line#/run-evals}"
+          eval_name="${parts[0]:-}"
+          config_key="${parts[1]:-}"
+          conc="${parts[2]:-}"
+          master_override="${parts[3]:-}"
+          if [[ -z "$eval_name" || -z "$config_key" ]]; then
+            echo "usage: /run-evals <eval> <config-key> [conc] [master-config]" >&2
+            echo "valid evals: gsm8k | gpqa | swebench_lite" >&2
+            exit 1
+          fi
+
+          # Map <eval> -> (framework, task YAML).
+          case "$eval_name" in
+            gsm8k)                  framework="lm-eval";  task="utils/evals/gsm8k.yaml" ;;
+            gpqa|gpqa_diamond)      framework="lm-eval";  task="utils/evals/gpqa_diamond.yaml" ;;
+            swebench|swebench_lite) framework="swebench"; task="utils/evals/swebench_lite.yaml" ;;
+            *)
+              echo "unknown eval '$eval_name' (valid: gsm8k, gpqa, swebench_lite)" >&2
+              exit 1
+              ;;
+          esac
+
+          if [[ -n "$conc" && ! "$conc" =~ ^[1-9][0-9]*$ ]]; then
+            echo "conc must be a positive integer, got '$conc'" >&2
+            exit 1
+          fi
+
+          # config-key and the optional master path come straight from the comment
+          # and are embedded in the generator command line, so restrict them to a
+          # conservative charset and forbid a leading '-' (option injection).
+          # '*' stays allowed so glob-style keys are not rejected here.
+          # NOTE(review): drop '*' if the generator only takes literal keys.
+          if [[ ! "$config_key" =~ ^[A-Za-z0-9*][A-Za-z0-9._*-]*$ ]]; then
+            echo "invalid config-key '$config_key' (allowed: letters, digits, '.', '_', '-', '*')" >&2
+            exit 1
+          fi
+          if [[ -n "$master_override" && ! "$master_override" =~ ^[A-Za-z0-9._/][A-Za-z0-9._/-]*$ ]]; then
+            echo "invalid master-config path '$master_override' (allowed: letters, digits, '.', '_', '-', '/')" >&2
+            exit 1
+          fi
+
+          # Pick the platform master config from the config-key's hardware token,
+          # unless an explicit 4th arg overrides it.
+          if [[ -n "$master_override" ]]; then
+            master="$master_override"
+          elif [[ "$config_key" =~ (b200|b300|h100|h200|gb200|gb300) ]]; then
+            master=".github/configs/nvidia-master.yaml"
+          elif [[ "$config_key" =~ (mi300x|mi325x|mi355x) ]]; then
+            master=".github/configs/amd-master.yaml"
+          else
+            echo "cannot infer platform from config-key '$config_key'; pass the master config path as a 4th arg" >&2
+            exit 1
+          fi
+
+          gen="test-config --config-files ${master} --config-keys ${config_key} --evals-only"
+          if [[ -n "$conc" ]]; then
+            gen="${gen} --conc ${conc}"
+          fi
+
+          {
+            echo "generator-args=${gen}"
+            echo "eval-framework=${framework}"
+            echo "eval-task=${task}"
+            echo "pr-number=${PR_NUMBER}"
+          } >> "$GITHUB_OUTPUT"
+
+      # Looks up the commenter's repository permission and sets the 'can-bypass'
+      # output to 'true' for admin/maintain/write, 'false' otherwise. Any lookup
+      # failure is treated as permission 'none' (fail closed).
+      - name: Check author permissions
+        id: auth
+        if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request }}
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        with:
+          script: |
+            const owner = context.repo.owner;
+            const repo = context.repo.repo;
+            const username = context.payload.comment?.user?.login;
+            let permission = 'none';
+            try {
+              if (!username) {
+                throw new Error('comment author login missing from event payload');
+              }
+              const res = await github.rest.repos.getCollaboratorPermissionLevel({ owner, repo, username });
+              permission = res.data?.permission || 'none';
+            } catch (e) {
+              // Still fail closed, but say why instead of swallowing the error so a
+              // broken lookup is distinguishable from a genuine outside contributor.
+              permission = 'none';
+              core.warning(`Permission lookup failed for ${username}: ${e?.message ?? e}; treating as 'none'`);
+            }
+            const canBypass = ['admin','maintain','write'].includes(permission);
+            core.info(`Author ${username} permission: ${permission}; bypass=${canBypass}`);
+            core.setOutput('can-bypass', canBypass ? 'true' : 'false');
+
+      # ---- PR SHA pinning ----
+      # Reads the PR's current head commit and exposes it as the 'ref' output so
+      # later jobs use a fixed SHA.
+      - name: Resolve immutable PR ref (pin to head SHA)
+        id: ref_comment
+        if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request && startsWith(github.event.comment.body, '/run-evals') }}
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        with:
+          script: |
+            const { owner, repo } = context.repo;
+            const pull_number = context.issue.number;
+            const { data } = await github.rest.pulls.get({ owner, repo, pull_number });
+            const headSha = data.head.sha;
+            core.info(`Resolved PR #${pull_number} head SHA: ${headSha}`);
+            core.setOutput('ref', headSha);
+
+      # Best-effort acknowledgement comment on the PR: run link, generator command,
+      # pinned short SHA and whether environment approval is needed. A failure
+      # here does not fail the job (continue-on-error).
+      - name: Reply with run link
+        if: ${{ github.event_name == 'issue_comment' && startsWith(github.event.comment.body, '/run-evals') && github.repository_owner == 'SemiAnalysisAI' }}
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        continue-on-error: true
+        env:
+          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          AUTHOR: ${{ github.event.comment.user.login }}
+          GEN_CMD: ${{ steps.parse.outputs.generator-args }}
+          EVAL_FRAMEWORK: ${{ steps.parse.outputs.eval-framework }}
+          CAN_BYPASS: ${{ steps.auth.outputs.can-bypass }}
+          PINNED_REF: ${{ steps.ref_comment.outputs.ref }}
+        with:
+          github-token: ${{ github.token }}
+          script: |
+            const { owner, repo } = context.repo;
+            const env = process.env;
+            const trusted = (env.CAN_BYPASS || '').toLowerCase() === 'true';
+            // First 7 characters of the pinned SHA; empty when no ref was resolved.
+            const shortSha = (env.PINNED_REF || '').slice(0, 7);
+            const approvalMsg = trusted
+              ? 'Approval: not required (trusted collaborator).'
+              : "Approval: required in environment 'Outside Collaborator E2E Test'.";
+            const lines = [
+              `@${env.AUTHOR} Kicking off an eval-only run (framework: \`${env.EVAL_FRAMEWORK || ''}\`).`,
+              '',
+              `Run: ${env.RUN_URL}`,
+              `Command: \`${env.GEN_CMD || ''}\``,
+              `Pinned ref: \`${shortSha}\``,
+              approvalMsg,
+            ];
+            await github.rest.issues.createComment({
+              owner,
+              repo,
+              issue_number: context.issue.number,
+              body: lines.join('\n'),
+            });
+
+  # Manual gate, tied to the 'Outside Collaborator E2E Test' environment, for
+  # commenters without bypass rights. Skipped when author-can-bypass is 'true'.
+  approval:
+    name: approval
+    needs: get-jobs
+    if: ${{ github.event_name == 'issue_comment' && needs.get-jobs.outputs.pr-number != '' && needs.get-jobs.outputs.generator-args != '' && needs.get-jobs.outputs.author-can-bypass != 'true' }}
+    environment: Outside Collaborator E2E Test
+    runs-on: ubuntu-latest
+    steps:
+      # The job has no real work to do; it only has to succeed once it is allowed to run.
+      - run: echo "approved"
+
+  # Calls the e2e-tests.yml reusable workflow for the requested eval once the
+  # request is authorised: either the commenter can bypass approval or the
+  # 'approval' job succeeded.
+  validate:
+    needs: [get-jobs, approval]
+    # always() is required to evaluate this condition when 'approval' is skipped (trusted author)
+    if: ${{ always() && needs.get-jobs.result == 'success' && needs.get-jobs.outputs.pr-number != '' && needs.get-jobs.outputs.generator-args != '' && (needs.get-jobs.outputs.author-can-bypass == 'true' || needs.approval.result == 'success') }}
+    # Concurrency at job level so non-/run-evals comments don't cancel active runs
+    concurrency:
+      group: "run-evals-PR#${{ needs.get-jobs.outputs.pr-number }}"
+      cancel-in-progress: true
+    uses: ./.github/workflows/e2e-tests.yml
+    name: validate
+    secrets: inherit
+    with:
+      generate-cli-command: ${{ needs.get-jobs.outputs.generator-args }}
+      eval-framework: ${{ needs.get-jobs.outputs.eval-framework }}
+      eval-task: ${{ needs.get-jobs.outputs.eval-task }}
+      # Must stay quoted: in an unquoted YAML scalar " #" starts a comment, which
+      # would truncate this value to just "PR".
+      test-name: "PR #${{ needs.get-jobs.outputs.pr-number }} /run-evals"
+      # Use pinned SHA to prevent TOCTOU on refs/pull/<n>/head
+      ref: ${{ needs.get-jobs.outputs.ref }}