Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .github/workflows/benchmark-multinode-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,16 @@ on:
type: string
required: false
default: ""
eval-framework:
description: "Eval framework (lm-eval | swebench). Empty = recipe default."
type: string
required: false
default: ""
eval-task:
description: "Eval task YAML path. Empty = framework default."
type: string
required: false
default: ""
scenario-type:
description: "Scenario type (fixed-seq-len or agentic-coding)"
type: string
Expand Down Expand Up @@ -143,6 +153,8 @@ env:
RUN_EVAL: ${{ inputs.run-eval }}
EVAL_ONLY: ${{ inputs.eval-only }}
EVAL_CONC: ${{ inputs.eval-conc }}
# Eval selection forwarded from the `eval-framework` / `eval-task` workflow inputs.
# NOTE(review): `eval-task` is described as an "Eval task YAML path", yet it is exported
# as EVAL_TASKS_DIR — confirm the consumer accepts a file path rather than a directory.
EVAL_FRAMEWORK: ${{ inputs.eval-framework }}
EVAL_TASKS_DIR: ${{ inputs.eval-task }}
SCENARIO_TYPE: ${{ inputs.scenario-type }}
SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || 'fixed_seq_len/' }}
IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }}
Expand Down
12 changes: 12 additions & 0 deletions .github/workflows/benchmark-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,16 @@ on:
type: boolean
required: false
default: false
eval-framework:
description: "Eval framework (lm-eval | swebench). Empty = recipe default."
type: string
required: false
default: ""
eval-task:
description: "Eval task YAML path. Empty = framework default."
type: string
required: false
default: ""
random-range-ratio:
required: false
type: string
Expand Down Expand Up @@ -108,6 +118,8 @@ env:
DISAGG: ${{ inputs.disagg }}
RUN_EVAL: ${{ inputs.run-eval }}
EVAL_ONLY: ${{ inputs.eval-only }}
# Eval selection forwarded from the `eval-framework` / `eval-task` workflow inputs.
# NOTE(review): `eval-task` is described as an "Eval task YAML path", yet it is exported
# as EVAL_TASKS_DIR — confirm the consumer accepts a file path rather than a directory.
EVAL_FRAMEWORK: ${{ inputs.eval-framework }}
EVAL_TASKS_DIR: ${{ inputs.eval-task }}
SCENARIO_TYPE: ${{ inputs.scenario-type }}
SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || 'fixed_seq_len/' }}
IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }}
Expand Down
24 changes: 24 additions & 0 deletions .github/workflows/e2e-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,16 @@
required: false
type: string
default: ""
# NOTE(review): the default here is "lm-eval", so callers that omit this input always
# pass an explicit framework downstream. The benchmark templates declare the same input
# with default "" meaning "recipe default" — confirm forcing lm-eval is intended, or
# switch this default to "" so the recipe default can take effect.
eval-framework:
description: "Eval framework (lm-eval | swebench). Overrides the recipe default."
required: false
type: string
default: "lm-eval"
eval-task:
description: "Eval task YAML path (e.g. utils/evals/gsm8k.yaml). Empty = framework default."
required: false
type: string
default: ""
workflow_call:
inputs:
generate-cli-command:
Expand All @@ -40,29 +50,39 @@
required: false
type: string
default: ""
# NOTE(review): the default here is "lm-eval", so callers that omit this input always
# pass an explicit framework downstream. The benchmark templates declare the same input
# with default "" meaning "recipe default" — confirm forcing lm-eval is intended, or
# switch this default to "" so the recipe default can take effect.
eval-framework:
description: "Eval framework (lm-eval | swebench). Overrides the recipe default."
required: false
type: string
default: "lm-eval"
eval-task:
description: "Eval task YAML path (e.g. utils/evals/gsm8k.yaml). Empty = framework default."
required: false
type: string
default: ""

jobs:
get-jobs:
runs-on: ubuntu-latest
outputs:
single-node-config: ${{ steps.get-jobs.outputs.single-node-config }}
multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }}
eval-config: ${{ steps.get-jobs.outputs.eval-config }}
multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }}
agentic-config: ${{ steps.get-jobs.outputs.agentic-config }}
multi-node-agentic-config: ${{ steps.get-jobs.outputs.multi-node-agentic-config }}
steps:
- name: Checkout code (ref)
if: ${{ inputs.ref && inputs.ref != '' }}
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
ref: ${{ inputs.ref }}

- name: Checkout code (default)
if: ${{ !inputs.ref || inputs.ref == '' }}
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
ref: ${{ github.sha }}

Check failure

Code scanning / CodeQL

Cache Poisoning via execution of untrusted code High

Potential cache poisoning in the context of the default branch due to privilege checkout of untrusted code. (
issue_comment
).

- id: get-jobs
run: |
Expand Down Expand Up @@ -160,6 +180,8 @@
run-eval: true
eval-only: true
eval-conc: ${{ matrix.config['eval-all-concs'] && join(matrix.config.conc, ' ') || matrix.config['eval-conc'] }}
eval-framework: ${{ inputs.eval-framework }}
eval-task: ${{ inputs.eval-task }}
ref: ${{ inputs.ref }}

test-sweep-agentic:
Expand Down Expand Up @@ -294,6 +316,8 @@
disagg: ${{ matrix.config.disagg }}
run-eval: true
eval-only: true
eval-framework: ${{ inputs.eval-framework }}
eval-task: ${{ inputs.eval-task }}
ref: ${{ inputs.ref }}

collect-results:
Expand Down
192 changes: 192 additions & 0 deletions .github/workflows/run-evals.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
name: Slash Command Run Evals
run-name: "/run-evals PR #${{ github.event.issue.number }}"

# Comment-triggered, eval-only run of ONE eval on ONE recipe (no perf sweep).
# Usage in a PR comment:
# /run-evals <eval> <config-key> [conc] [master-config]
# where <eval> is one of: gsm8k | gpqa (alias: gpqa_diamond) | swebench_lite (alias: swebench).
# Example: /run-evals swebench_lite dsr1-fp4-b200-sglang 16
# Mirrors pr-comment-sweep.yml; differs only in parsing + the eval mapping it
# forwards to e2e-tests.yml (eval-framework / eval-task).

on:
issue_comment:
types: [created]

permissions:
contents: read
issues: write
pull-requests: write

jobs:
# Parses the /run-evals comment, checks the commenter's repository permission and
# pins the PR head SHA; its outputs drive the `approval` and `validate` jobs.
get-jobs:
# Only run for PR comments that start with /run-evals
if: ${{ github.event.issue.pull_request && startsWith(github.event.comment.body, '/run-evals') }}
runs-on: ubuntu-latest
outputs:
# Values produced by the `parse` step from the comment text.
pr-number: ${{ steps.parse.outputs.pr-number }}
generator-args: ${{ steps.parse.outputs.generator-args }}
eval-framework: ${{ steps.parse.outputs.eval-framework }}
eval-task: ${{ steps.parse.outputs.eval-task }}
# 'true' when the `auth` step found admin/maintain/write permission for the commenter.
author-can-bypass: ${{ steps.auth.outputs.can-bypass }}
# Immutable ref (commit SHA) to prevent TOCTOU on refs/pull/<n>/head
ref: ${{ steps.ref_comment.outputs.ref }}
steps:
# Turns the first "/run-evals ..." line of the PR comment into step outputs:
#   generator-args  CLI arguments for the config generator (eval-only)
#   eval-framework  lm-eval | swebench
#   eval-task       task YAML path for the selected eval
#   pr-number       number of the PR the comment was posted on
# The comment body is untrusted: it is only ever read from the BODY env var and
# every token that ends up in generator-args is validated against a strict
# character allow-list, because generator-args is handed to e2e-tests.yml as
# `generate-cli-command` (presumably executed there — keep it shell-inert).
- name: Parse PR comment (/run-evals <eval> <config-key> [conc] [master])
  id: parse
  if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request && startsWith(github.event.comment.body, '/run-evals') }}
  shell: bash
  env:
    BODY: ${{ github.event.comment.body }}
    PR_NUMBER: ${{ github.event.issue.number }}
  run: |
    set -euo pipefail

    # Require /run-evals as a whole word at the start of a line.
    #  - strip a trailing CR: comments posted from the web UI use CRLF line
    #    endings, and the CR would otherwise stick to the last argument;
    #  - do not `exit` from awk: closing the pipe early can kill printf with
    #    SIGPIPE, which `pipefail` would turn into a step failure.
    cmd_line=$(printf "%s" "$BODY" | awk '{ sub(/\r$/, "") } !found && /^\/run-evals([ \t]|$)/ { print; found = 1 }')
    if [[ -z "$cmd_line" ]]; then
      echo "No /run-evals command found at comment start" >&2
      exit 1
    fi

    # Positional args after the command.
    read -ra parts <<< "${cmd_line#/run-evals}"
    eval_name="${parts[0]:-}"
    config_key="${parts[1]:-}"
    conc="${parts[2]:-}"
    master_override="${parts[3]:-}"
    if [[ -z "$eval_name" || -z "$config_key" ]]; then
      echo "usage: /run-evals <eval> <config-key> [conc] [master-config]" >&2
      echo "valid evals: gsm8k | gpqa | swebench_lite" >&2
      exit 1
    fi

    # Map <eval> -> (framework, task YAML).
    case "$eval_name" in
      gsm8k) framework="lm-eval"; task="utils/evals/gsm8k.yaml" ;;
      gpqa|gpqa_diamond) framework="lm-eval"; task="utils/evals/gpqa_diamond.yaml" ;;
      swebench|swebench_lite) framework="swebench"; task="utils/evals/swebench_lite.yaml" ;;
      *)
        echo "unknown eval '$eval_name' (valid: gsm8k, gpqa, swebench_lite)" >&2
        exit 1
        ;;
    esac

    # The config key is spliced into the generator command line: allow only
    # shell-inert characters and forbid a leading '-' (option injection).
    safe_key_re='^[A-Za-z0-9][A-Za-z0-9._-]*$'
    if [[ ! "$config_key" =~ $safe_key_re ]]; then
      echo "invalid config-key '$config_key' (allowed: letters, digits, '.', '_', '-')" >&2
      exit 1
    fi

    if [[ -n "$conc" && ! "$conc" =~ ^[1-9][0-9]*$ ]]; then
      echo "conc must be a positive integer, got '$conc'" >&2
      exit 1
    fi

    # Pick the platform master config from the config-key's hardware token,
    # unless an explicit 4th arg overrides it.
    safe_path_re='^[A-Za-z0-9._][A-Za-z0-9._/-]*$'
    if [[ -n "$master_override" ]]; then
      # Same allow-list idea as for the key, plus '/' for paths; reject any
      # '..' so the override cannot walk out of the repository checkout.
      if [[ ! "$master_override" =~ $safe_path_re || "$master_override" == *..* ]]; then
        echo "invalid master-config path '$master_override'" >&2
        exit 1
      fi
      master="$master_override"
    elif [[ "$config_key" =~ (b200|b300|h100|h200|gb200|gb300) ]]; then
      master=".github/configs/nvidia-master.yaml"
    elif [[ "$config_key" =~ (mi300x|mi325x|mi355x) ]]; then
      master=".github/configs/amd-master.yaml"
    else
      echo "cannot infer platform from config-key '$config_key'; pass the master config path as a 4th arg" >&2
      exit 1
    fi

    gen="test-config --config-files ${master} --config-keys ${config_key} --evals-only"
    if [[ -n "$conc" ]]; then
      gen="${gen} --conc ${conc}"
    fi

    {
      echo "generator-args=${gen}"
      echo "eval-framework=${framework}"
      echo "eval-task=${task}"
      echo "pr-number=${PR_NUMBER}"
    } >> "$GITHUB_OUTPUT"

# Looks up the commenter's permission on this repository and exposes
# `can-bypass` ('true' | 'false') for the approval gate.
- name: Check author permissions
  id: auth
  if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request }}
  uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
  with:
    script: |
      const { owner, repo } = context.repo;
      const username = context.payload.comment?.user?.login;

      // Any failure of the lookup is treated as "no permission".
      async function lookupPermission() {
        try {
          const response = await github.rest.repos.getCollaboratorPermissionLevel({ owner, repo, username });
          return response.data?.permission || 'none';
        } catch (err) {
          return 'none';
        }
      }

      const permission = await lookupPermission();
      // Only these permission levels may skip the manual approval job.
      const trustedLevels = ['admin', 'maintain', 'write'];
      const canBypass = trustedLevels.includes(permission);
      core.info(`Author ${username} permission: ${permission}; bypass=${canBypass}`);
      core.setOutput('can-bypass', String(canBypass));

# ---- PR SHA pinning ----
# Resolves the PR's current head commit and exposes it as output `ref`, so
# later jobs check out a fixed SHA instead of a moving branch ref.
- name: Resolve immutable PR ref (pin to head SHA)
  id: ref_comment
  if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request && startsWith(github.event.comment.body, '/run-evals') }}
  uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
  with:
    script: |
      const { owner, repo } = context.repo;
      const pull_number = context.issue.number;
      const { data: pullRequest } = await github.rest.pulls.get({ owner, repo, pull_number });
      const headSha = pullRequest.head.sha;
      core.info(`Resolved PR #${pull_number} head SHA: ${headSha}`);
      core.setOutput('ref', headSha);

# Best-effort acknowledgement comment on the PR: links the workflow run and
# echoes the generator command, pinned SHA and whether approval is needed.
# A failure to post must not fail the job (continue-on-error).
- name: Reply with run link
  if: ${{ github.event_name == 'issue_comment' && startsWith(github.event.comment.body, '/run-evals') && github.repository_owner == 'SemiAnalysisAI' }}
  uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
  continue-on-error: true
  env:
    RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
    AUTHOR: ${{ github.event.comment.user.login }}
    GEN_CMD: ${{ steps.parse.outputs.generator-args }}
    EVAL_FRAMEWORK: ${{ steps.parse.outputs.eval-framework }}
    CAN_BYPASS: ${{ steps.auth.outputs.can-bypass }}
    PINNED_REF: ${{ steps.ref_comment.outputs.ref }}
  with:
    github-token: ${{ github.token }}
    script: |
      const { owner, repo } = context.repo;
      const issue_number = context.issue.number;

      // Everything user-influenced arrives through env vars, not interpolation.
      const env = process.env;
      const framework = env.EVAL_FRAMEWORK || '';
      const genCmd = env.GEN_CMD || '';
      const shortSha = (env.PINNED_REF || '').slice(0, 7);
      const trusted = (env.CAN_BYPASS || '').toLowerCase() === 'true';

      let approvalMsg;
      if (trusted) {
        approvalMsg = 'Approval: not required (trusted collaborator).';
      } else {
        approvalMsg = "Approval: required in environment 'Outside Collaborator E2E Test'.";
      }

      const body = [
        `@${env.AUTHOR} Kicking off an eval-only run (framework: \`${framework}\`).`,
        '',
        `Run: ${env.RUN_URL}`,
        `Command: \`${genCmd}\``,
        `Pinned ref: \`${shortSha}\``,
        approvalMsg,
      ].join('\n');
      await github.rest.issues.createComment({ owner, repo, issue_number, body });

# Manual gate for commenters without write access: the job only runs when
# the author cannot bypass, and is bound to the
# 'Outside Collaborator E2E Test' environment.
approval:
  name: approval
  needs: get-jobs
  runs-on: ubuntu-latest
  environment: Outside Collaborator E2E Test
  if: ${{ github.event_name == 'issue_comment' && needs.get-jobs.outputs.pr-number != '' && needs.get-jobs.outputs.generator-args != '' && needs.get-jobs.outputs.author-can-bypass != 'true' }}
  steps:
    - run: echo "approved"

# Runs the eval-only sweep by calling the reusable e2e-tests workflow with the
# parsed generator command, the selected eval framework/task and the pinned SHA.
validate:
  needs: [get-jobs, approval]
  # always() is required to evaluate this condition when 'approval' is skipped (trusted author)
  if: ${{ always() && needs.get-jobs.result == 'success' && needs.get-jobs.outputs.pr-number != '' && needs.get-jobs.outputs.generator-args != '' && (needs.get-jobs.outputs.author-can-bypass == 'true' || needs.approval.result == 'success') }}
  # Concurrency at job level so non-/run-evals comments don't cancel active runs
  concurrency:
    group: "run-evals-PR#${{ needs.get-jobs.outputs.pr-number }}"
    cancel-in-progress: true
  uses: ./.github/workflows/e2e-tests.yml
  name: validate
  secrets: inherit
  with:
    generate-cli-command: ${{ needs.get-jobs.outputs.generator-args }}
    eval-framework: ${{ needs.get-jobs.outputs.eval-framework }}
    eval-task: ${{ needs.get-jobs.outputs.eval-task }}
    # Must be quoted: in a plain scalar " #" starts a YAML comment, which would
    # silently truncate the value to just "PR".
    test-name: "PR #${{ needs.get-jobs.outputs.pr-number }} /run-evals"
    # Use pinned SHA to prevent TOCTOU on refs/pull/<n>/head
    ref: ${{ needs.get-jobs.outputs.ref }}
Loading