diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 4c4ff67b8..1911b2587 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1312,6 +1312,266 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=2"
 
+minimaxm2.5-fp8-mi300x-vllm-disagg:
+  image: ghcr.io/simondanielsson/vllm/vllm-openai-rocm:fix-moriio-hangs-high-concurrency
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi300x-disagg
+  precision: fp8
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # Top of curve: 2P1D
+      - spec-decoding: "none"
+        conc-list: [256, 512, 1024, 2048 ]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=2"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+
+      # Bottom of curve: 1P2D
+      - spec-decoding: "none"
+        conc-list: [8, 16, 32, 64, 128]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+
+minimaxm2.5-fp8-mi325x-vllm-disagg:
+  image: ghcr.io/simondanielsson/vllm/vllm-openai-rocm:fix-moriio-hangs-high-concurrency
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi325x-disagg
+  precision: fp8
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # Top of curve: 2P1D
+      - spec-decoding: "none"
+        conc-list: [256, 512, 1024, 2048 ]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=2"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+
+      # Bottom of curve: 1P2D
+      - spec-decoding: "none"
+        conc-list: [8, 16, 32, 64, 128]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+
+kimik2.5-fp4-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x-disagg
+  precision: fp4
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
+minimaxm2.5-fp8-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi355x-disagg
+  precision: fp8
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
+      # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536,
+      # TP8 shards to 192 which is not divisible by FP8 block_n=128.
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
 dsr1-fp4-mi355x-sglang-disagg:
   image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
   model: amd/DeepSeek-R1-0528-MXFP4-v2
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index f901b1ff7..5880de49b 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -271,7 +271,9 @@ jobs:
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: multinode_server_logs_${{ env.RESULT_FILENAME }}
-          path: multinode_server_logs.tar.gz
+          path: |
+            multinode_server_logs.tar.gz
+            benchmark_artifacts/
           if-no-files-found: ignore
 
       - name: Upload agentic aggregated result
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 677df68c1..e4c9c5fb1 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -204,10 +204,11 @@ run_benchmark_serving() {
     local result_filename=""
     local result_dir=""
     local workspace_dir=""
+    local tokenizer=""
     local use_chat_template=false
     local dsv4=false
     local trust_remote_code=false
     local server_pid=""
 
     while [[ $# -gt 0 ]]; do
         case $1 in
@@ -268,6 +269,10 @@ run_benchmark_serving() {
                 use_chat_template=true
                 shift
                 ;;
+            --tokenizer)
+                tokenizer="$2"
+                shift 2
+                ;;
             --trust-remote-code)
                 trust_remote_code=true
                 shift
@@ -383,6 +388,11 @@ run_benchmark_serving() {
         benchmark_cmd+=(--trust-remote-code)
     fi
 
+    # Append --tokenizer to benchmark_cmd only when the caller supplied one.
+    if [[ -n "$tokenizer" ]]; then
+        benchmark_cmd+=(--tokenizer "$tokenizer")
+    fi
+
     # Run benchmark with optional server monitoring
     set -x
     if [[ -n "$server_pid" ]]; then
diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh
index ac996c5a9..05384f435 100755
--- a/benchmarks/multi_node/amd_utils/bench.sh
+++ b/benchmarks/multi_node/amd_utils/bench.sh
@@ -1,4 +1,17 @@
 #!/bin/bash
+# Dual-Engine Disaggregated Benchmark Runner
+#
+# ENGINE=sglang-disagg (default): SGLang benchmark
+# ENGINE=vllm-disagg:             vLLM benchmark
+#
+# Produces JSON result files via benchmark_serving.py so that the CI pipeline
+# can collect and process results.
+#
+# Usage: bash bench.sh <n_prefill> <n_decode> <prefill_gpus> <decode_gpus> \
+#            <model_dir> <model_name> <log_path> <isl> <osl> \
+#            <concurrency_list> <req_rate> <random_range_ratio> <num_prompts_multiplier>
+
+ENGINE="${ENGINE:-sglang-disagg}"
 
 n_prefill=$1
 n_decode=$2
@@ -6,58 +19,90 @@ prefill_gpus=$3
 decode_gpus=$4
 model_path=$5
 model_name=$6
-MODEL_PATH="${model_path}/${model_name}"
+MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}"
+# vllm-disagg uses --served-model-name MODEL_NAME; sglang defaults to MODEL_PATH
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    BENCH_MODEL="${MODEL_NAME:-${MODEL_PATH}}"
+else
+    BENCH_MODEL="${MODEL_PATH}"
+fi
 log_path=$7
 
 chosen_isl=${8:-1024}
 chosen_osl=${9:-1024}
 concurrency_list=${10:-"512x1"}
-chosen_req_rate=${11:-1}
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    chosen_req_rate=${11:-inf}
+else
+    chosen_req_rate=${11:-1}
+fi
 random_range_ratio=${12:-0.8}
 num_prompts_multiplier=${13:-10}
 
 IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list"
 
-echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"
+ROUTER_PORT="${ROUTER_PORT:-30000}"
 
-head_node="localhost"
-head_port="30000"
+export TRANSFORMERS_VERBOSITY=error
+export TOKENIZERS_PARALLELISM=false
 
+echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"
 
-profile_folder="${log_path}/sglang_isl_${chosen_isl}_osl_${chosen_osl}"
-mkdir -p $profile_folder
+profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}"
+mkdir -p "$profile_folder"
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-# Repo root inside the container (3 levels up from this script's directory)
 REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
 
-for max_concurrency in ${chosen_concurrencies[@]}; do
+for max_concurrency in "${chosen_concurrencies[@]}"; do
 
     export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}"
 
+    num_prompts=$(( max_concurrency * num_prompts_multiplier ))
+    if [[ "$num_prompts" -lt 16 ]]; then
+        num_prompts=16
+    fi
+
     echo "profile_folder: $profile_folder"
     echo "max_concurrency: $max_concurrency"
     echo "chosen_req_rate: $chosen_req_rate"
     echo "MODEL_PATH: $MODEL_PATH"
-    echo "head_port: $head_port"
+    echo "ROUTER_PORT: $ROUTER_PORT"
     echo "chosen_isl: $chosen_isl"
     echo "chosen_osl: $chosen_osl"
+    echo "num_prompts: $num_prompts"
     echo "export_file: $export_file"
 
+    # Engine-specific extra flags
+    extra_flags=""
+    if [[ "$ENGINE" == "vllm-disagg" ]]; then
+        extra_flags="--trust-remote-code --tokenizer $MODEL_PATH"
+    else
+        if [ "$IS_MTP" = "true" ]; then
+            extra_flags="--use-chat-template"
+        fi
+    fi
+
     run_benchmark_serving \
         --bench-serving-dir "$REPO_ROOT" \
-        --model  ${MODEL_PATH} \
-        --port ${head_port} \
+        --model "$BENCH_MODEL" \
+        --port "$ROUTER_PORT" \
         --backend openai \
-        --input-len ${chosen_isl} \
-        --output-len ${chosen_osl} \
-        --random-range-ratio ${random_range_ratio} \
-        --num-prompts $(( $max_concurrency * $num_prompts_multiplier )) \
+        --input-len "$chosen_isl" \
+        --output-len "$chosen_osl" \
+        --random-range-ratio "$random_range_ratio" \
+        --num-prompts "$num_prompts" \
         --max-concurrency "$max_concurrency" \
         --result-filename "$export_file" \
         --result-dir /workspace/ \
-        $( [ "$IS_MTP" = "true" ] && echo "--use-chat-template" )
+        $extra_flags
 
     echo "-----------------------------------------"
+
+    # vLLM: cooldown between rounds for idle KV block reaper
+    if [[ "$ENGINE" == "vllm-disagg" ]]; then
+        echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..."
+        sleep 10
+    fi
 done
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index d0b99eddc..50001766f 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -1,109 +1,212 @@
 #!/bin/bash
-# SGLang/MoRI environment setup for multi-node disaggregated serving.
+# Dual-engine environment setup for multi-node disaggregated serving.
+#
+# ENGINE=sglang-disagg (default): SGLang/MoRI environment
+# ENGINE=vllm-disagg:             vLLM/Nixl environment
 #
 # REQUIRED ENVIRONMENT VARIABLES:
 #   IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...)
-#               This must be set by the runner script (runners/launch_mi355x-amds.sh)
-#
-# OPTIONAL ENVIRONMENT VARIABLES:
-#   MORI_RDMA_TC - RDMA traffic class (e.g., 96, 104). Set by runner if cluster uses QoS.
-
+#               Set by runner or auto-detected via ibv_devinfo.
 set -x
+
+ENGINE="${ENGINE:-sglang-disagg}"
 export PYTHONDONTWRITEBYTECODE=1
 
-# IBDEVICES configuration
+# =============================================================================
+# Shared: IBDEVICES detection
+# =============================================================================
+
 # Prefer IBDEVICES set by runner (runners/launch_mi355x-amds.sh)
-# Fall back to hostname detection if not set (for direct script execution)
+# Fall back to ibv_devinfo enumeration if not set (for direct script execution)
 if [[ -z "$IBDEVICES" ]]; then
-    NODENAME=$(hostname -s)
-    if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
-        export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7
-    elif [[ $NODENAME == mia1* ]]; then
-        export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+    DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',')
+    if [[ -n "$DETECTED" ]]; then
+        export IBDEVICES="$DETECTED"
+        echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES via ibv_devinfo on $(hostname -s)"
     else
-        echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2
-        exit 1
+        echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2
     fi
-    echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $NODENAME"
 else
     echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)"
 fi
 export IBDEVICES
 
-# Auto-detect default network interface (portable across clusters)
-export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
-export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
+# Shared: Auto-detect default network interface (portable across clusters)
+# Only auto-detect if not already set by the runner/environment
+if [[ -z "$GLOO_SOCKET_IFNAME" ]]; then
+    export GLOO_SOCKET_IFNAME=$(ip route 2>/dev/null | grep '^default' | awk '{print $5}' | head -n 1)
+fi
+if [[ -z "$NCCL_SOCKET_IFNAME" ]]; then
+    export NCCL_SOCKET_IFNAME=$(ip route 2>/dev/null | grep '^default' | awk '{print $5}' | head -n 1)
+fi
 
+set +x
 
-export NCCL_IB_HCA=$IBDEVICES
+export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES}
+
+# =============================================================================
+# Engine-specific environment
+# =============================================================================
+
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    # =========================================================================
+    # vLLM/Nixl-specific environment
+    # =========================================================================
+    set -x
+
+    export VLLM_MORIIO_QP_PER_TRANSFER=4
+    export VLLM_MORIIO_NUM_WORKERS=4
+    export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000
+    export MORI_IO_QP_MAX_SEND_WR=16384
+    export MORI_IO_QP_MAX_CQE=32768
+    export MORI_IO_QP_MAX_SGE=4
+    export MORI_IO_TC_DISABLE=0
+
+    # UCX_NET_DEVICES: Use the first tw-eth interface for UCX TCP transport
+    if [[ -z "$UCX_NET_DEVICES" ]]; then
+        UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth/{print $2}' | head -1)
+        if [[ -n "$UCX_NET_DEV" ]]; then
+            export UCX_NET_DEVICES="$UCX_NET_DEV"
+        else
+            FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1)
+            if [[ -n "$FIRST_IB" ]]; then
+                export UCX_NET_DEVICES="${FIRST_IB}:1"
+            fi
+        fi
+        echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES"
+    else
+        echo "[INFO] Using UCX_NET_DEVICES=$UCX_NET_DEVICES (set by environment)"
+    fi
 
-export SGLANG_USE_AITER=1
+    # RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing
+    export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1}
 
-export SGLANG_MORI_DISPATCH_DTYPE=auto
-export SGLANG_MORI_FP8_COMB=true
-export SGLANG_MORI_QP_PER_TRANSFER=4
-export SGLANG_MORI_NUM_WORKERS=4
-export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000
+    # QoS/DSCP configuration for lossless RoCEv2 fabric.
+    if [[ -n "$UCX_IB_TRAFFIC_CLASS" ]]; then
+        echo "[INFO] Using UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS (set by environment)"
+    elif command -v nicctl &> /dev/null; then
+        ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}')
+        ND_DSCP=$(nicctl show qos 2>/dev/null | awk -v p="$ND_PRIO" '
+$1 == "DSCP" && $2 == ":" && $NF == p {
+    print $3; exit
+}')
+        if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then
+            export UCX_IB_TRAFFIC_CLASS=$(( 4 * ND_DSCP ))
+            export UCX_IB_SL=$ND_PRIO
+            echo "[INFO] Detected QoS from nicctl: UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS, UCX_IB_SL=$UCX_IB_SL"
+        else
+            echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection."
+            NODENAME=$(hostname -s)
+            if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
+                export UCX_IB_TRAFFIC_CLASS=96
+                echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME"
+            elif [[ $NODENAME == mia1* ]]; then
+                export UCX_IB_TRAFFIC_CLASS=104
+                echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME"
+            fi
+        fi
+    else
+        NODENAME=$(hostname -s)
+        if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
+            export UCX_IB_TRAFFIC_CLASS=96
+            echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME"
+        elif [[ $NODENAME == mia1* ]]; then
+            export UCX_IB_TRAFFIC_CLASS=104
+            echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME"
+        else
+            echo "[INFO] No nicctl and unable to detect from hostname. Skipping QoS configuration."
+        fi
+    fi
+
+    set +x
+    echo "[INFO] IBDEVICES=$IBDEVICES  UCX_NET_DEVICES=$UCX_NET_DEVICES  NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME  UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX  UCX_IB_TRAFFIC_CLASS=${UCX_IB_TRAFFIC_CLASS:-unset}"
+
+else
+    # =========================================================================
+    # SGLang/MoRI-specific environment
+    # =========================================================================
+
+    export SGLANG_USE_AITER=1
 
-export MORI_IO_QP_MAX_SEND_WR=16384
-export MORI_IO_QP_MAX_CQE=32768 
-export MORI_IO_QP_MAX_SGE=4
+    export SGLANG_MORI_DISPATCH_DTYPE=auto
+    export SGLANG_MORI_FP8_COMB=true
+    export SGLANG_MORI_QP_PER_TRANSFER=4
+    export SGLANG_MORI_NUM_WORKERS=4
+    export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000
 
-export MORI_IO_TC_DISABLE=0
+    export MORI_IO_QP_MAX_SEND_WR=16384
+    export MORI_IO_QP_MAX_CQE=32768
+    export MORI_IO_QP_MAX_SGE=4
 
-export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600
-export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600
+    export MORI_IO_TC_DISABLE=0
 
-# Disable allocating memory in one pass
-export MORI_SHMEM_MODE=ISOLATION
+    export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600
+    export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600
 
-# Enable spec v2 
-export SGLANG_ENABLE_SPEC_V2=1
-export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
+    # Disable allocating memory in one pass
+    export MORI_SHMEM_MODE=ISOLATION
 
-export SGLANG_LOG_MS=true
-export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32
+    # Enable spec v2
+    export SGLANG_ENABLE_SPEC_V2=1
+    export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
 
-export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192
-export MORI_MAX_DISPATCH_TOKENS_DECODE=512
+    export SGLANG_LOG_MS=true
+    export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32
 
-export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768
-export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703
+    export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192
+    export MORI_MAX_DISPATCH_TOKENS_DECODE=512
 
-# set MTP size=1 when EP16
-export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
+    export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768
+    export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703
 
-export MORI_EP_LAUNCH_CONFIG_MODE=AUTO
+    # set MTP size=1 when EP16
+    export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
 
+    export MORI_EP_LAUNCH_CONFIG_MODE=AUTO
 
-export MORI_APP_LOG_LEVEL=INFO
+    export MORI_APP_LOG_LEVEL=INFO
 
-# Router logging control:
-# 0 (default) keeps noisy per-request access logs out of stdout while still logging to file.
-# 1 mirrors router logs to stdout via tee (useful for live debugging).
-export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}"
+    # Router logging control:
+    # 0 (default) keeps noisy per-request access logs out of stdout while still logging to file.
+    # 1 mirrors router logs to stdout via tee (useful for live debugging).
+    export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}"
 
-# QoS/DSCP configuration
-# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname
-if [[ -n "$MORI_RDMA_TC" ]]; then
-    echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)"
-elif command -v nicctl &> /dev/null; then
-    ND_PRIO=$(nicctl show qos  2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}')
-    ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" '
+    # QoS/DSCP configuration
+    # Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname
+    if [[ -n "$MORI_RDMA_TC" ]]; then
+        echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)"
+    elif command -v nicctl &> /dev/null; then
+        ND_PRIO=$(nicctl show qos  2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}')
+        ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" '
 $1 == "DSCP" && $2 == ":" && $NF == p {
     print $3; exit
 }')
 
-    if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then
-        TC=$(( 4 * ND_DSCP ))
-        export MORI_RDMA_SL=$ND_PRIO
-        export MORI_IO_SL=$ND_PRIO
-        export MORI_RDMA_TC=$TC
-        export MORI_IO_TC=$TC
-        echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL"
+        if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then
+            TC=$(( 4 * ND_DSCP ))
+            export MORI_RDMA_SL=$ND_PRIO
+            export MORI_IO_SL=$ND_PRIO
+            export MORI_RDMA_TC=$TC
+            export MORI_IO_TC=$TC
+            echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL"
+        else
+            echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection."
+            # Fall back to hostname-based detection
+            NODENAME=$(hostname -s)
+            if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
+                export MORI_RDMA_TC=96
+                export MORI_IO_TC=96
+                echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
+            elif [[ $NODENAME == mia1* ]]; then
+                export MORI_RDMA_TC=104
+                export MORI_IO_TC=104
+                echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
+            else
+                echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
+            fi
+        fi
     else
-        echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection."
-        # Fall back to hostname-based detection
+        # nicctl not available, try hostname-based detection
         NODENAME=$(hostname -s)
         if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
             export MORI_RDMA_TC=96
@@ -114,28 +217,12 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
             export MORI_IO_TC=104
             echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
         else
-            echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
+            echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
+            echo "       This is normal for clusters without QoS or outside Docker containers."
         fi
     fi
-else
-    # nicctl not available, try hostname-based detection
-    NODENAME=$(hostname -s)
-    if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
-        export MORI_RDMA_TC=96
-        export MORI_IO_TC=96
-        echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
-    elif [[ $NODENAME == mia1* ]]; then
-        export MORI_RDMA_TC=104
-        export MORI_IO_TC=104
-        echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
-    else
-        echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
-        echo "       This is normal for clusters without QoS or outside Docker containers."
-    fi
-fi
-
-# FIXME: WA for latest upstream 0305 image
-export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH}
 
+    # FIXME: WA for latest upstream 0305 image
+    export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH}
 
-set +x
+fi
diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 824605c46..3d2d9a1fd 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -1,265 +1,297 @@
 #!/bin/bash
-#SBATCH --job-name=1p2d_bench-serving    # Specify a custom string for your slurm batch job
-#SBATCH -N 3            # CHECK this to be right in batch jobs
-#SBATCH -n 3          # CHECK this to be right in batch jobs
+#SBATCH --job-name=disagg-bench
+#SBATCH -N 3            # Overridden by submit.sh -N flag
+#SBATCH -n 3            # Overridden by submit.sh -n flag
 #SBATCH --ntasks-per-node=1
 #SBATCH --spread-job
-#SBATCH --gres=gpu:8      # Request 8 GPUs and 8 NICs (use --gres if specific GPU resources are needed)
-#SBATCH --time=24:00:00         # Set a time limit for the job (HH:MM:SS)
+#SBATCH --gres=gpu:8
+#SBATCH --time=24:00:00
 # --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR
 
+ENGINE="${ENGINE:-sglang-disagg}"
 
-# ------------------------
-# Print current time in UTC and PST formats
-# ------------------------
 echo "=== Job Start Time ==="
 echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')"
 echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')"
+echo "ENGINE: $ENGINE"
 echo "======================="
 echo ""
 
 # =============================================================================
-# Model validation from models.yaml (replaces hardcoded VALID_MODELS array)
+# Model Validation
 # =============================================================================
-# DI_REPO_DIR is set below from $(pwd); use the submit-time working directory
-# because sbatch copies this script to /var/spool/slurmd/ at runtime.
-MODELS_YAML="$(pwd)/models.yaml"
+
+# Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/
+# at runtime, but the CWD remains the submit-time directory (amd_utils/).
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    MODELS_YAML="$(pwd)/models_vllm.yaml"
+else
+    MODELS_YAML="$(pwd)/models.yaml"
+fi
 
 if [[ ! -f "$MODELS_YAML" ]]; then
-    echo "Error: models.yaml not found at $MODELS_YAML"
+    echo "Error: models YAML not found at $MODELS_YAML"
+    exit 1
+fi
+
+if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then
+    echo "Error: DOCKER_IMAGE_NAME is not set."
     exit 1
 fi
 
-# Validate MODEL_NAME exists as a top-level key in models.yaml
+MODEL_NAME="${MODEL_NAME:-None}"
 if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then
-    echo "Error: Model '$MODEL_NAME' not found in models.yaml"
+    echo "Error: Model '$MODEL_NAME' not found in $MODELS_YAML"
     echo "Available models:"
     grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/  - /'
     exit 1
 fi
 echo "Model found: $MODEL_NAME"
 
-# All models use server.sh as the entrypoint
 RUN_FILE="server.sh"
 echo "Runfile set: $RUN_FILE"
 
-if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then
-    echo "Error: DOCKER_IMAGE_NAME is not set."
-    exit 1
-fi
-
-# DI_REPO_DIR points to the repo root so Docker can access both benchmarks/ and utils/.
+# DI_REPO_DIR points to the repo root.
 # $(pwd) is amd_utils/ (the sbatch submit dir); go up 3 levels to reach the repo root.
 export DI_REPO_DIR=$(cd "$(pwd)/../../.." && pwd)
 
-xP="${xP:-1}" #-> Number of Prefill Workers
-yD="${yD:-1}" #-> Number of Decode Workers
+xP="${xP:-1}"
+yD="${yD:-1}"
 
-# Parallelism Configuration with defaults
-PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}"
-PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}"
-PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}"
-DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}"
-DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}"
-DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}"
-DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} # 0 for disabling MTP
-
-# Benchmark Configuration with defaults
+# Benchmark configuration
 BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}"
 BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}"
 BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}"
 BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}"
 BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}"
+BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}"
 
 GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
 
-MODEL_NAME="${MODEL_NAME:-None}"
+# Parallelism defaults (all engines); exported so the srun step shell, which runs with set -u, can expand them
+export PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-false}"
+export PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-false}"
+export DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-false}"
+export DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-false}"
+export PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}"
+export DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}"
+export DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}"
+
+# Router selection: "vllm-router" (external container) or "moriio" (in-container proxy)
+ROUTER_TYPE="${ROUTER_TYPE:-vllm-router}"
+ROUTER_PORT="${ROUTER_PORT:-30000}"
+PROXY_PING_PORT="${PROXY_PING_PORT:-36367}"
+
+# =============================================================================
+# Model Path Resolution
+# =============================================================================
 
 # MODEL_DIR detection: prefer env var, fall back to hostname detection
 if [[ -z "$MODEL_DIR" ]]; then
     NODENAME=$(hostname -s)
     if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
         MODEL_DIR="/nfsdata"
-        echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME"
     elif [[ $NODENAME == mia1* ]]; then
         MODEL_DIR="/it-share/data"
-        echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME"
     else
-        MODEL_DIR="/nfsdata"  # Default fallback
-        echo "[INFO] Using default MODEL_DIR=$MODEL_DIR (hostname $NODENAME not recognized)"
+        MODEL_DIR="/nfsdata"
     fi
+    echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $(hostname -s)"
 fi
 export MODEL_DIR
 
-# ------------------------
-# Model path validation and selection across all nodes
-# ------------------------
-echo "Looking for model: $MODEL_NAME"
-echo "Checking model availability across all allocated nodes..."
-
-# Get all allocated nodes
-ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
-TOTAL_NODES=$(echo "$ALL_NODES" | wc -l)
-
-echo "Total allocated nodes: $TOTAL_NODES"
-echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')"
-
-# Function to check model path on all nodes
-check_model_path() {
-    local path=$1
-    local check_name=$2
-
-    echo "Checking $check_name: $path"
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    # vLLM: Extract hf_dir from $MODELS_YAML (models_vllm.yaml), search multiple paths, resolve HF cache snapshots
+    DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next}
+        found && /^[^ ]/{exit}
+        found && /hf_dir:/{gsub(/[" ]/, "", $2); print $2; exit}' "$MODELS_YAML")
+    DISK_DIR_NAME="${DISK_DIR_NAME:-$MODEL_NAME}"
+    echo "Looking for model: $MODEL_NAME (disk dir: $DISK_DIR_NAME)"
+
+    resolve_hf_cache_path() {
+        local base_path=$1
+        if [[ -d "${base_path}/snapshots" ]]; then
+            local snapshot=$(ls -1 "${base_path}/snapshots" 2>/dev/null | head -1)
+            if [[ -n "$snapshot" ]]; then
+                echo "${base_path}/snapshots/${snapshot}"
+                return 0
+            fi
+        fi
+        echo "$base_path"
+        return 1
+    }
+
+    MODEL_PATH=""
+    SEARCH_PATHS=(
+        "${MODEL_DIR}/${DISK_DIR_NAME}"
+        "${MODEL_DIR}/${MODEL_NAME}"
+        "/nfsdata/hf_hub_cache-0/${DISK_DIR_NAME}"
+        "/nfsdata/hf_hub_cache-0/${MODEL_NAME}"
+    )
+
+    for search_path in "${SEARCH_PATHS[@]}"; do
+        if [[ -d "$search_path" ]]; then
+            RESOLVED=$(resolve_hf_cache_path "$search_path")
+            MODEL_PATH="$RESOLVED"
+            echo "Found MODEL_PATH: $MODEL_PATH"
+            break
+        fi
+    done
 
-    # Run check on all nodes in parallel
-    srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c "
-        if [ -d '$path' ]; then
-            echo \"\$(hostname): ✓ Found $path\"
-            exit 0
+    if [[ -z "$MODEL_PATH" ]]; then
+        echo "FATAL: Model '$MODEL_NAME' not found. Searched:"
+        for p in "${SEARCH_PATHS[@]}"; do echo "  - $p"; done
+        exit 1
+    fi
+    echo "Final MODEL_PATH: $MODEL_PATH"
+else
+    # SGLang: Validate model path across all allocated nodes
+    echo "Looking for model: $MODEL_NAME"
+    echo "Checking model availability across all allocated nodes..."
+
+    ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+    TOTAL_NODES=$(echo "$ALL_NODES" | wc -l)
+    echo "Total allocated nodes: $TOTAL_NODES"
+    echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')"
+
+    check_model_path() {
+        local path=$1
+        local check_name=$2
+        echo "Checking $check_name: $path"
+        srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c "
+            if [ -d '$path' ]; then
+                echo \"\$(hostname): Found $path\"
+                exit 0
+            else
+                echo \"\$(hostname): Missing $path\"
+                exit 1
+            fi
+        "
+        local exit_code=$?
+        if [ $exit_code -eq 0 ]; then
+            echo "$check_name available on ALL nodes"
+            return 0
         else
-            echo \"\$(hostname): ✗ Missing $path\"
-            exit 1
+            echo "$check_name NOT available on all nodes"
+            return 1
         fi
-    "
+    }
 
-    # Check if all nodes succeeded (exit code 0)
-    local exit_code=$?
-    if [ $exit_code -eq 0 ]; then
-        echo "✓ $check_name available on ALL nodes"
-        return 0
+    if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then
+        MODEL_PATH="$MODEL_DIR/$MODEL_NAME"
+        echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)"
     else
-        echo "✗ $check_name NOT available on all nodes"
-        return 1
+        echo "FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in:"
+        echo "  - $MODEL_DIR/$MODEL_NAME"
+        exit 1
     fi
-}
-
-# Check model weights exist on "$MODEL_DIR/$MODEL_NAME"
-if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then
-    MODEL_PATH="$MODEL_DIR/$MODEL_NAME"
-    echo ""
-    echo "✓ Selected MODEL_PATH: $MODEL_PATH (available on all nodes)"
-else
-    echo ""
-    echo "✗ FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in the following:"
-    echo "  - $MODEL_DIR/$MODEL_NAME"
-    echo ""
-    echo "Model must be accessible from all nodes for distributed execution."
-    echo "Please ensure the model is available on all allocated nodes."
-    exit 1
+    echo "Final MODEL_PATH: $MODEL_PATH"
 fi
 
-echo "Final MODEL_PATH: $MODEL_PATH"
-echo ""
-
-NUM_NODES="${NUM_NODES}"
+# =============================================================================
+# Node Selection
+# =============================================================================
 
-# ------------------------
-# Extract first NUM_NODES from SLURM allocation and update SLURM variables
-# ------------------------
-echo "Original SLURM allocation:"
-echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
-echo "SLURM_NNODES: $SLURM_NNODES"
-echo "SLURM_NTASKS: $SLURM_NTASKS"
+NUM_NODES=$((xP + yD))
+echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD)"
 
-# Get the full nodelist and extract first NUM_NODES
 FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
 SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES)
 SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//')
 
-# Create new nodelist in SLURM format
-# This is a simplified approach - for complex ranges, you might need more sophisticated parsing
-NEW_SLURM_NODELIST=$(echo "$SELECTED_NODES" | paste -sd, | sed 's/,/,/g')
+# Docker privilege detection — evaluated per-node since group membership varies.
+# Exported as a snippet so every srun participant resolves it locally.
+#
+# Middle branch (sg-docker shim) is a workaround for stale supplementary groups
+# on long-running GHA runners: the kernel-level group list is frozen at runner
+# start time and predates the gharunner→docker group add, even though NSS now
+# lists gharunner in docker. sg(1) is setuid root, so it can set the docker GID
+# per invocation; we wrap docker in a PATH shim so xargs/etc. also work.
+# Remove this branch once the runners are restarted.
+export DOCKER_CMD_DETECT='
+if docker ps &>/dev/null 2>&1; then
+    DOCKER_CMD=docker
+elif command -v sg >/dev/null 2>&1 && sg docker -c "docker ps" &>/dev/null 2>&1; then
+    SHIM_DIR="/tmp/docker-sg-shim-$$"
+    mkdir -p "$SHIM_DIR"
+    cat >"$SHIM_DIR/docker" <<"SHIM_EOF"
+#!/bin/bash
+F=$(mktemp)
+printf "%s\0" "$@" > "$F"
+sg docker -c "xargs -0 -a $F /usr/bin/docker"
+rc=$?
+rm -f "$F"
+exit $rc
+SHIM_EOF
+    chmod +x "$SHIM_DIR/docker"
+    export PATH="$SHIM_DIR:$PATH"
+    DOCKER_CMD=docker
+    echo "[docker-detect] using sg-docker shim on $(hostname)" >&2
+elif command -v sudo >/dev/null 2>&1 && sudo -n docker ps &>/dev/null 2>&1; then
+    DOCKER_CMD="sudo -n docker"
+    echo "[docker-detect] using passwordless sudo docker on $(hostname)" >&2
+else
+    out=$(docker ps 2>&1 || true)
+    echo "[docker-detect] no usable docker command on $(hostname) as $(id -un)" >&2
+    echo "[docker-detect] id: $(id)" >&2
+    echo "[docker-detect] groups: $(groups 2>/dev/null || true)" >&2
+    ls -l /var/run/docker.sock >&2 2>/dev/null || true
+    echo "[docker-detect] docker ps: $out" >&2
+    echo "[docker-detect] sg docker: $(command -v sg >/dev/null 2>&1 && sg docker -c "docker ps" 2>&1 || true)" >&2
+    echo "[docker-detect] sudo docker: $(command -v sudo >/dev/null 2>&1 && sudo -n docker ps 2>&1 || true)" >&2
+    exit 1
+fi'
 
 # Update SLURM environment variables
 export SLURM_NNODES=$NUM_NODES
 export SLURM_NTASKS=$NUM_NODES
 export SLURM_JOB_NUM_NODES=$NUM_NODES
 export SLURM_NPROCS=$NUM_NODES
-export SLURM_JOB_NODELIST="$NEW_SLURM_NODELIST"
-export SLURM_NODELIST="$NEW_SLURM_NODELIST"
-
-# Keep other SLURM variables as they were or set defaults
+export SLURM_JOB_NODELIST="$SELECTED_NODELIST_STR"
+export SLURM_NODELIST="$SELECTED_NODELIST_STR"
 export SLURM_TASKS_PER_NODE="1(x$NUM_NODES)"
-export SLURM_SUBMIT_DIR="${SLURM_SUBMIT_DIR:-$HOME}"
-export SLURM_CLUSTER_NAME="${SLURM_CLUSTER_NAME}"  # Let SLURM set this automatically
-export SLURM_JOB_CPUS_PER_NODE="${SLURM_JOB_CPUS_PER_NODE}"
-export SLURM_JOB_PARTITION="${SLURM_JOB_PARTITION}"  # Should be set by sbatch/runner
-export SLURM_JOBID="${SLURM_JOBID:-$SLURM_JOB_ID}"
-export SLURM_JOB_QOS="${SLURM_JOB_QOS}"  # Should be set by sbatch/runner if needed
-export SLURM_JOB_ACCOUNT="${SLURM_JOB_ACCOUNT}"  # Should be set by sbatch/runner
 export SLURM_NTASKS_PER_NODE=1
-export SLURM_SUBMIT_HOST="${SLURM_SUBMIT_HOST}"
-export SLURM_JOB_ID="${SLURM_JOB_ID}"
-# SLURM_CONF is auto-set by SLURM, no need to override
-export SLURM_JOB_NAME="${SLURM_JOB_NAME:-1p1d_bench-serving}"
 
 echo ""
-echo "Updated SLURM Environment Variables:"
-echo "SLURM_JOB_ID: $SLURM_JOB_ID"
-echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
-echo "SLURM_NNODES: $SLURM_NNODES"
-echo "SLURM_NTASKS: $SLURM_NTASKS"
-echo "SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE"
-echo "SLURM_JOB_CPUS_PER_NODE: $SLURM_JOB_CPUS_PER_NODE"
-echo "SLURM_JOB_PARTITION: $SLURM_JOB_PARTITION"
-echo "SLURM_JOB_NUM_NODES: $SLURM_JOB_NUM_NODES"
-echo "SLURM_JOBID: $SLURM_JOBID"
-echo "SLURM_JOB_QOS: $SLURM_JOB_QOS"
-echo "SLURM_NODELIST: $SLURM_NODELIST"
-echo "SLURM_JOB_ACCOUNT: $SLURM_JOB_ACCOUNT"
-echo "SLURM_NPROCS: $SLURM_NPROCS"
-echo "SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST"
-echo "SLURM_CONF: $SLURM_CONF"
-echo "SLURM_JOB_NAME: $SLURM_JOB_NAME"
-echo "SLURM_NTASKS_PER_NODE: $SLURM_NTASKS_PER_NODE"
-echo "SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR"
-echo "SLURM_CLUSTER_NAME: $SLURM_CLUSTER_NAME"
-echo "ulimit: $(ulimit -a)"
-echo ""
-echo "Selected nodes for execution:"
-echo "$SELECTED_NODES"
-echo ""
+echo "Selected nodes: $SELECTED_NODELIST_STR"
+
+# =============================================================================
+# IP Resolution
+# =============================================================================
 
-# Node information
 USER_NAME=$(whoami)
 MASTER_NODE=$(echo "$SELECTED_NODES" | head -n 1)
 NODE0_ADDR=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$MASTER_NODE" bash -c 'ip route get 1.1.1.1')
 NODE0_ADDR=$(echo "$NODE0_ADDR" | awk '/src/ {print $7}')
 
 IPS=()
-
-GW_NIC=$(ip route | awk '/^default/ {print $5; exit}')
 for NODE in $SELECTED_NODES; do
     IP=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$NODE" bash -c 'ip route get 1.1.1.1')
     IP=$(echo "$IP" | awk '/src/ {print $7}')
     IPS+=("$IP")
 done
 
-echo "Selected node IPs: ${IPS[*]}" | sed 's/ /,/g'
+echo "Node IPs: ${IPS[*]}"
 
 DOCKER_MOUNT_PATH="/workspace"
-SGLANG_WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils"
-timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
+WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils"
 
 NNODES=$NUM_NODES
 
-echo "MASTER_NODE is ${MASTER_NODE}"
-echo "NODE0_ADDR is ${NODE0_ADDR}"
-echo "NNODES is ${NNODES}"
-echo "REPO Directory is ${DI_REPO_DIR}"
-echo "USER_NAME is ${USER_NAME}"
-
-# Get the RDMA priority and DSCP value from the NIC
-if ! command -v nicctl >/dev/null 2>&1; then
-    echo "Error: nicctl command not found. Please ensure nicctl is installed and available." >&2
-    exit 1
-fi
+echo "MASTER_NODE: ${MASTER_NODE}"
+echo "NODE0_ADDR:  ${NODE0_ADDR}"
+echo "NNODES:      ${NNODES}"
+echo "REPO DIR:    ${DI_REPO_DIR}"
+echo "USER:        ${USER_NAME}"
 
 # Reduce log spam
 export TQDM_MININTERVAL=20
 
+# Translate the host-resolved MODEL_PATH to the Docker mount namespace (MODEL_DIR is mounted at /models; a path outside MODEL_DIR is left unchanged)
+DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}"
+
 export DI_REPO_DIR=$DI_REPO_DIR
-export SGLANG_WS_PATH=$SGLANG_WS_PATH
+export WS_PATH=$WS_PATH
 export NNODES=$NNODES
 export NODE0_ADDR=$NODE0_ADDR
 export MODEL_PATH=$MODEL_PATH
@@ -269,21 +301,17 @@ export yD=$yD
 export MODEL_NAME=$MODEL_NAME
 export USER_NAME=$USER_NAME
 export IPADDRS="$(echo "${IPS[*]}" | sed 's/ /,/g')"
-export PREFILL_TP_SIZE=$PREFILL_TP_SIZE
-export PREFILL_ENABLE_EP=$PREFILL_ENABLE_EP
-export PREFILL_ENABLE_DP=$PREFILL_ENABLE_DP
-export DECODE_TP_SIZE=$DECODE_TP_SIZE
-export DECODE_ENABLE_EP=$DECODE_ENABLE_EP
-export DECODE_ENABLE_DP=$DECODE_ENABLE_DP
-export DECODE_MTP_SIZE=$DECODE_MTP_SIZE
 export GPUS_PER_NODE=$GPUS_PER_NODE
 export BENCH_INPUT_LEN=$BENCH_INPUT_LEN
 export BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN
 export BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO
 export BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER
 export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY
+export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE
 export DRY_RUN="${DRY_RUN:-0}"
 export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
+export KEEP_CONTAINERS="${KEEP_CONTAINERS:-0}"
+export ENGINE=$ENGINE
 
 # Eval-related env vars (threaded from submit.sh)
 export RUN_EVAL="${RUN_EVAL:-false}"
@@ -298,38 +326,105 @@ export SPEC_DECODING="${SPEC_DECODING:-}"
 export IS_MULTINODE="${IS_MULTINODE:-false}"
 
 SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
-export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"
-export RUN_FILE_FULL="$SGLANG_WS_PATH/${RUN_FILE}"
+export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"
 
+# vLLM external router container
+VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260629-e667ebb}"
+ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}"
+export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}"
 
-# Use only the selected nodes for srun execution
 SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,)
 
-
 cleanup() {
-  echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..."
-  # clean up the logs folder
-  sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true
-
+  echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..."
+  [[ -n "${SLURM_SUBMIT_DIR:-}" ]] && rm -rf "${SLURM_SUBMIT_DIR}/logs" 2>/dev/null || true  # guard: an empty SLURM_SUBMIT_DIR must never expand to /logs
   echo "[${SLURM_JOB_ID}] cleanup done."
 }
 
 trap cleanup INT TERM HUP
 
-
-# Force NFS cache refresh on all nodes before running Docker to avoid stale file handle errors
+# Force NFS cache refresh on all nodes
 echo "Refreshing NFS caches on all nodes..."
 srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '
     sync
-    # Force re-stat of the mounted directory to refresh NFS handles
     ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils > /dev/null 2>&1
     stat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1
     cat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1
-    # Drop caches if we have permission (optional, requires root)
     echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true
     echo "NFS cache refreshed on $(hostname)"
 '
 
+# =============================================================================
+# Build engine-specific Docker environment variables
+# =============================================================================
+
+# Common env vars (always passed)
+DOCKER_ENV_COMMON=(
+    -e SLURM_JOB_ID=\$SLURM_JOB_ID
+    -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST
+    -e NNODES=\$NNODES
+    -e NODE_RANK=\$SLURM_PROCID
+    -e NODE0_ADDR=\$NODE0_ADDR
+    -e MODEL_DIR=/models
+    -e MODEL_NAME=\$MODEL_NAME
+    -e GPUS_PER_NODE=\$GPUS_PER_NODE
+    -e xP=\$xP
+    -e yD=\$yD
+    -e IPADDRS=\$IPADDRS
+    -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN
+    -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN
+    -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO
+    -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER
+    -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY
+    -e TQDM_MININTERVAL=\$TQDM_MININTERVAL
+    -e DRY_RUN=\$DRY_RUN
+    -e BENCHMARK_LOGS_DIR=/benchmark_logs
+    -e ENGINE=\$ENGINE
+    -e WS_PATH=${WS_PATH}
+    -e RUN_EVAL=\$RUN_EVAL
+    -e EVAL_ONLY=\$EVAL_ONLY
+    -e EVAL_CONC=\$EVAL_CONC
+    -e FRAMEWORK=\$FRAMEWORK
+    -e PRECISION=\$PRECISION
+    -e MODEL_PREFIX=\$MODEL_PREFIX
+    -e RUNNER_TYPE=\$RUNNER_TYPE
+    -e RESULT_FILENAME=\$RESULT_FILENAME
+    -e SPEC_DECODING=\$SPEC_DECODING
+    -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE
+    -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP
+    -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP
+    -e DECODE_TP_SIZE=\$DECODE_TP_SIZE
+    -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP
+    -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP
+    -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE
+)
+
+# Engine-specific env vars
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    DOCKER_ENV_ENGINE=(
+        -e VLLM_WS_PATH=${WS_PATH}
+        -e MODEL_PATH=$DOCKER_MODEL_PATH
+        -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma
+        -e UCX_SOCKADDR_TLS_PRIORITY=tcp
+        -e UCX_MEMTYPE_CACHE=y
+        -e UCX_RNDV_SCHEME=get_zcopy
+        -e UCX_RNDV_THRESH=4k
+        -e UCX_ROCM_IPC_MIN_ZCOPY=0
+        -e UCX_LOG_LEVEL=warn
+        -e HSA_ENABLE_SDMA=1
+        -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300}
+        -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1}
+        -e PYTHONPYCACHEPREFIX=/tmp/pycache
+    )
+else
+    DOCKER_ENV_ENGINE=(
+        -e SGLANG_WS_PATH=${WS_PATH}
+    )
+fi
+
+# Engine-specific container filter for pre-clean
+CONT_FILTER="name=^container_${ENGINE}_"
+
 srun \
   --nodelist="$SELECTED_NODELIST_SRUN" \
   --kill-on-bad-exit=1 \
@@ -340,11 +435,44 @@ set -euo pipefail
 
 echo \"Rank \$SLURM_PROCID on \$(hostname)\"
 
+# Per-node docker privilege detection
+eval \"\$DOCKER_CMD_DETECT\"
+echo \"[docker-detect] rank \$SLURM_PROCID: DOCKER_CMD=\$DOCKER_CMD\"
+
 # Pre-clean (idempotent)
-sudo docker ps -aq --filter \"name=^container_sbatch_\" | xargs -r sudo docker rm -f || true
-sudo docker ps -aq | xargs -r sudo docker stop || true
+\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true
+\$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true
+
+# Start vLLM external router container on node 0
+if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then
+    \$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true
+    \$DOCKER_CMD run -d \
+        --name \"$ROUTER_CONT_NAME\" \
+        --network host \
+        -v /tmp:/run_logs \
+        \"$VLLM_ROUTER_IMAGE\" \
+        bash -lc \"mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \
+            --vllm-pd-disaggregation \
+            --kv-connector moriio \
+            --vllm-discovery-address 0.0.0.0:${PROXY_PING_PORT} \
+            --port ${ROUTER_PORT} \
+            --host 0.0.0.0 \
+            --policy consistent_hash \
+            --prefill-policy consistent_hash \
+            --decode-policy consistent_hash \
+            --log-level info 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_\$(hostname).log \"
+fi
+
+# Skip exec on vllm-disagg rank 0 so we can stop the router after the main
+# container exits.  Without this, decode nodes block forever waiting for the
+# router port to close (the router is a separate container).
+MAYBE_EXEC=exec
+if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then
+    MAYBE_EXEC=
+    set +e
+fi
 
-exec sudo docker run --rm \
+\$MAYBE_EXEC \$DOCKER_CMD run \
     --init \
     --stop-timeout 10 \
     --device /dev/dri \
@@ -367,62 +495,38 @@ exec sudo docker run --rm \
     --cap-add SYS_PTRACE \
     --security-opt seccomp=unconfined \
     --privileged \
+    -v /sys:/sys \
+    $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \
     -v ${MODEL_DIR}:/models \
     -v \$HOME/.ssh:/root/.ssh \
-    -v $(which nicctl):/usr/sbin/nicctl \
     --shm-size 128G \
     -v /tmp:/run_logs \
     -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \
     -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \
-    -e SLURM_JOB_ID=\$SLURM_JOB_ID \
-    -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST \
-    -e NNODES=\$NNODES \
-    -e NODE_RANK=\$SLURM_PROCID \
-    -e NODE0_ADDR=\$NODE0_ADDR \
-    -e MODEL_DIR=/models \
-    -e SGLANG_WS_PATH=${SGLANG_WS_PATH} \
-    -e GPUS_PER_NODE=\$GPUS_PER_NODE \
-    -e xP=\$xP \
-    -e yD=\$yD \
-    -e MODEL_NAME=\$MODEL_NAME \
-    -e IPADDRS=\$IPADDRS \
-    -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \
-    -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \
-    -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP \
-    -e DECODE_TP_SIZE=\$DECODE_TP_SIZE \
-    -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP \
-    -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP \
-    -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE \
-    -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN \
-    -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN \
-    -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO \
-    -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER \
-    -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY \
-    -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \
-    -e DRY_RUN=\$DRY_RUN \
-    -e BENCHMARK_LOGS_DIR=/benchmark_logs \
-    -e RUN_EVAL=\$RUN_EVAL \
-    -e EVAL_ONLY=\$EVAL_ONLY \
-    -e EVAL_CONC=\$EVAL_CONC \
-    -e FRAMEWORK=\$FRAMEWORK \
-    -e PRECISION=\$PRECISION \
-    -e MODEL_PREFIX=\$MODEL_PREFIX \
-    -e RUNNER_TYPE=\$RUNNER_TYPE \
-    -e RESULT_FILENAME=\$RESULT_FILENAME \
-    -e SPEC_DECODING=\$SPEC_DECODING \
-    -e IS_MULTINODE=\$IS_MULTINODE \
+    ${DOCKER_ENV_COMMON[*]} \
+    ${DOCKER_ENV_ENGINE[*]} \
     --name \"$DOCKER_CONT_NAME\" \
+    --entrypoint \"\" \
     \"$DOCKER_IMAGE_NAME\" bash -lc '
         set -o pipefail
         mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'
         '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log
     '
 
+# Only reached when exec was skipped (vllm-disagg rank 0 with ROUTER_TYPE=vllm-router)
 DOCKER_EXIT_CODE=\$?
-if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then
-  echo \"ERROR: docker exited rc=\$DOCKER_EXIT_CODE on \$(hostname)\"
-  exit \$DOCKER_EXIT_CODE
-fi
+echo \"[rank 0] Main container exited (rc=\$DOCKER_EXIT_CODE). Stopping vllm-router...\"
+\$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true
+exit \$DOCKER_EXIT_CODE
 "
 
-srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'sudo docker rm -f $DOCKER_CONT_NAME 2>/dev/null || true'
+if [[ "${KEEP_CONTAINERS}" != "1" ]]; then
+    srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'eval "$DOCKER_CMD_DETECT"; $DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true'
+
+    # Clean up vLLM external router container on node 0
+    if [[ "$ENGINE" == "vllm-disagg" && "$ROUTER_TYPE" == "vllm-router" ]]; then
+        srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c '
+            eval "$DOCKER_CMD_DETECT"; $DOCKER_CMD rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true
+        '
+    fi
+fi
diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml
new file mode 100644
index 000000000..c68bb46e3
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml
@@ -0,0 +1,42 @@
+# Model-specific vLLM server configurations for disaggregated inference.
+#
+# Each top-level key is a MODEL_NAME value (must match the model identifier
+# used in amd-master.yaml and, unless hf_dir is set, the directory name under MODEL_DIR).
+#
+# To add a new model: add a new top-level entry following the same schema.
+# No script changes are required.
+#
+# Schema:
+#   <model-name>:
+#     prefill_flags: str       # vLLM CLI flags for prefill workers
+#     decode_flags: str        # vLLM CLI flags for decode workers
+#     env: str                 # Space-separated KEY=VALUE pairs exported before vllm serve
+#     hf_dir: str              # (optional) On-disk directory name if it differs from the key
+#                              #   e.g. HF cache layout: models--amd--Kimi-K2.5-MXFP4
+
+Llama-3.1-405B-Instruct-FP8-KV:
+  prefill_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8"
+  decode_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8"
+  env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1"
+
+amd-Llama-3.3-70B-Instruct-FP8-KV:
+  prefill_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8"
+  decode_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8"
+  env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1"
+
+Kimi-K2.5-MXFP4:
+  prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
+  decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
+  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600"
+  hf_dir: "models--amd--Kimi-K2.5-MXFP4"
+
+MiniMax-M2.5:
+  prefill_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
+  decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
+  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600"
+  hf_dir: "models--MiniMaxAI--MiniMax-M2.5"
+
+gpt-oss-120b:
+  prefill_flags: "--tensor-parallel-size 8"
+  decode_flags: "--tensor-parallel-size 8"
+  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0"
diff --git a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py
new file mode 100644
index 000000000..ac830eb1f
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2025 The MiniMax AI team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only MiniMaxM2/M2.5 model."""
+
+from collections.abc import Iterable
+from typing import Any
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm._aiter_ops import rocm_aiter_ops
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.attention import Attention
+from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.model_executor.models.utils import sequence_parallel_chunk
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+
+class MiniMaxM2MoE(nn.Module):
+    """MoE layer for MiniMax M2/M2.5 with EP/WideEP/Mori support.
+
+    Follows the DeepSeek V2 MoE pattern: GateLinear + FusedMoE with
+    expert parallelism, EPLB, and sequence parallel awareness.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        vllm_config = get_current_vllm_config()
+        parallel_config = vllm_config.parallel_config
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = get_ep_group().rank_in_group
+        self.ep_size = self.ep_group.size()
+
+        self.n_routed_experts: int = config.num_local_experts
+        self.n_shared_experts: int = 0
+
+        self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
+        self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0)
+        self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
+
+        eplb_config = parallel_config.eplb_config
+        self.enable_eplb = parallel_config.enable_eplb
+        self.n_redundant_experts = eplb_config.num_redundant_experts
+        self.n_logical_experts = self.n_routed_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+
+        self.use_routing_bias = getattr(config, "use_routing_bias", False)
+        if self.use_routing_bias:
+            self.e_score_correction_bias = nn.Parameter(
+                torch.empty(config.num_local_experts, dtype=torch.float32)
+            )
+            self.e_score_correction_bias.weight_loader = (
+                MiniMaxM2MoE.ebias_weight_loader
+            )
+        else:
+            self.e_score_correction_bias = None
+
+        self.gate = GateLinear(
+            config.hidden_size,
+            config.num_local_experts,
+            out_dtype=torch.float32,
+            prefix=f"{prefix}.gate",
+        )
+
+        self.experts = FusedMoE(
+            num_experts=config.num_local_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            renormalize=True,
+            scoring_func=getattr(config, "scoring_func", "softmax"),
+            e_score_correction_bias=self.e_score_correction_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+            is_sequence_parallel=self.is_sequence_parallel,
+            router_logits_dtype=torch.float32,
+            gate=self.gate,
+            routed_scaling_factor=1.0
+            if not self.is_rocm_aiter_moe_enabled
+            else self.routed_scaling_factor,
+        )
+
+    @staticmethod
+    def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None:
+        assert param.size() == loaded_weight.size()
+        param.data.copy_(loaded_weight.to(torch.float32))
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        if self.is_sequence_parallel:
+            hidden_states = sequence_parallel_chunk(hidden_states)
+
+        if self.experts.is_internal_router:
+            final_hidden_states = self.experts(
+                hidden_states=hidden_states, router_logits=hidden_states
+            )
+        else:
+            router_logits, _ = self.gate(hidden_states)
+            final_hidden_states = self.experts(
+                hidden_states=hidden_states, router_logits=router_logits
+            )
+
+        if hidden_states.dtype != torch.float16:
+            if not self.is_rocm_aiter_moe_enabled:
+                final_hidden_states = final_hidden_states * self.routed_scaling_factor
+
+        if self.is_sequence_parallel:
+            final_hidden_states = tensor_model_parallel_all_gather(
+                final_hidden_states, 0
+            )
+            final_hidden_states = final_hidden_states[:num_tokens]
+        elif self.tp_size > 1:
+            from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
+            final_hidden_states = tensor_model_parallel_all_reduce(
+                final_hidden_states
+            )
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class MiniMaxM2Attention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rotary_dim: int,
+        rope_parameters: dict[str, Any] | None = None,
+        attn_window_size: int | None = None,
+        max_position_embeddings: int = 8192,
+        head_dim: int | None = None,
+        rms_norm_eps: float = 1e-06,
+        qkv_bias: bool = False,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = head_dim or (hidden_size // self.total_num_heads)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        if (
+            rope_parameters is not None
+            and "partial_rotary_factor" not in rope_parameters
+        ):
+            rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            max_position=max_position_embeddings,
+            rope_parameters=rope_parameters,
+        )
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            per_layer_sliding_window=attn_window_size,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+        self.q_norm = MiniMaxText01RMSNormTP(
+            self.head_dim * self.total_num_heads, eps=rms_norm_eps
+        )
+        self.k_norm = MiniMaxText01RMSNormTP(
+            self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = MiniMaxText01RMSNormTP.forward_qk(
+            self.q_norm, self.k_norm, q.contiguous(), k.contiguous()
+        )
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class MiniMaxM2DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        prefix: str,
+        model_config: ModelConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int):
+            max_position_embeddings = max(
+                config.max_position_embeddings, config.max_model_len
+            )
+        # DecoderLayers are created with `make_layers` which passes the prefix
+        # with the layer's index.
+        layer_idx = int(prefix.split(sep=".")[-1])
+
+        self.layer_idx = layer_idx
+        self.self_attn = MiniMaxM2Attention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rotary_dim=config.rotary_dim,
+            rope_parameters=config.rope_parameters,
+            max_position_embeddings=max_position_embeddings,
+            rms_norm_eps=config.rms_norm_eps,
+            qkv_bias=getattr(config, "attention_bias", False),
+            head_dim=getattr(config, "head_dim", None),
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+
+        self.block_sparse_moe = MiniMaxM2MoE(
+            config=config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+    ) -> torch.Tensor:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+
+        hidden_states = self.block_sparse_moe(hidden_states)
+
+        return hidden_states, residual
+
+
+@support_torch_compile
+class MiniMaxM2Model(nn.Module):
+    """Decoder stack for MiniMax M2/M2.5.
+
+    Holds the token embedding (first pipeline rank only), the decoder layers
+    built through `make_layers`, and the final RMSNorm (last pipeline rank
+    only). Ranks that do not own the embedding or the norm get a
+    `PPMissingLayer` placeholder instead.
+    """
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=None,
+                prefix=f"{prefix}.embed_tokens",
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: MiniMaxM2DecoderLayer(
+                config,
+                prefix,
+                model_config=model_config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+            ),
+            prefix=f"{prefix}.layers",
+        )
+
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
+        # Factory for the placeholder tensors exchanged between pipeline ranks;
+        # the keys match what forward() reads and returns.
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        """Look up token embeddings for `input_ids`."""
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        """Run this rank's slice of decoder layers.
+
+        On the first pipeline rank the input is `inputs_embeds` when given,
+        otherwise the embedding of `input_ids`. On later ranks the hidden
+        states and residual come from `intermediate_tensors`, which must not
+        be None.
+
+        Returns:
+            `IntermediateTensors` with "hidden_states" and "residual" on every
+            rank but the last; the normalized hidden states on the last rank.
+        """
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_input_ids(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        for layer in self.layers[self.start_layer : self.end_layer]:
+            hidden_states, residual = layer(positions, hidden_states, residual)
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        """Build the expert mapping from the checkpoint names w1/w2/w3.
+
+        Each entry is unpacked by load_weights() as
+        `(param_name, weight_name, expert_id, shard_id)`.
+        """
+        # NOTE(review): num_redundant_experts is fixed at 0 here, while
+        # MiniMaxM2MoE passes eplb_config.num_redundant_experts to FusedMoE.
+        # Confirm that weights load correctly when EPLB redundant experts are
+        # configured.
+        return FusedMoE.make_expert_params_mapping(
+            self,
+            ckpt_gate_proj_name="w1",
+            ckpt_down_proj_name="w2",
+            ckpt_up_proj_name="w3",
+            num_experts=self.config.num_local_experts,
+            num_redundant_experts=0,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Load checkpoint tensors into this module's parameters.
+
+        Each weight is tried against three routes, in order: the stacked
+        q/k/v -> qkv_proj mapping, the expert mapping, and finally a direct
+        load by (possibly kv-scale-remapped) name. Rotary `inv_freq` entries
+        and weights belonging to speculative-decoding layers are skipped.
+
+        Returns:
+            The set of parameter names (after renaming) that were loaded.
+        """
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = self.get_expert_mapping()
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
+            if spec_layer is not None:
+                continue  # skip spec decode layers for main model
+
+            # Route 1: stacked q/k/v projections. The for/else falls through
+            # to route 2 only when no stacked entry ended in `break`.
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                # NOTE(review): `name` is rewritten before the skip checks
+                # below, so a skipped candidate leaves the rewritten name in
+                # place for the remaining iterations and routes.
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if is_pp_missing_parameter(name, self):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Route 2: expert weights; the first matching mapping wins.
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Route 3: everything else, loaded by name. The
+                    # `continue` statements here advance the outer loop over
+                    # `weights`, so skipped names are not recorded as loaded.
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class MiniMaxM2MixtureOfExperts(MixtureOfExperts):
+    """EPLB protocol implementation for MiniMax M2/M2.5."""
+
+    moe_mlp_layers: list[MiniMaxM2MoE]
+
+    def extract_moe_parameters(self, example_moe: MiniMaxM2MoE | None):
+        if example_moe is None:
+            self.num_moe_layers = 0
+            self.num_expert_groups = 0
+            self.num_logical_experts = 0
+            self.num_physical_experts = 0
+            self.num_local_physical_experts = 0
+            self.num_routed_experts = 0
+            self.num_shared_experts = 0
+            self.num_redundant_experts = 0
+            logger.warning("MiniMax M2: No MoE layer found in model.layers.")
+        else:
+            self.num_logical_experts = example_moe.n_logical_experts
+            self.num_physical_experts = example_moe.n_physical_experts
+            self.num_local_physical_experts = example_moe.n_local_physical_experts
+            self.num_routed_experts = example_moe.n_routed_experts
+            self.num_shared_experts = example_moe.n_shared_experts
+            self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for moe in self.moe_mlp_layers:
+            moe.n_local_physical_experts = num_local_physical_experts
+            moe.n_physical_experts = num_physical_experts
+            moe.n_redundant_experts = self.num_redundant_experts
+            moe.experts.update_expert_map()
+
+
+class MiniMaxM2ForCausalLM(
+    nn.Module, SupportsLoRA, SupportsPP, MiniMaxM2MixtureOfExperts
+):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+        if hasattr(vllm_config.model_config, "max_model_len"):
+            self.config.max_model_len = vllm_config.model_config.max_model_len
+        self.model = MiniMaxM2Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size, config.hidden_size, quant_config=None
+            )
+        else:
+            self.lm_head = PPMissingLayer()
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+
+        self.num_moe_layers = config.num_hidden_layers
+        self._set_moe_parameters()
+
+    def _set_moe_parameters(self):
+        self.expert_weights: list = []
+        self.num_expert_groups = 1
+        self.moe_layers: list = []
+        self.moe_mlp_layers: list[MiniMaxM2MoE] = []
+        example_moe = None
+        for layer in self.model.layers:
+            if isinstance(layer, PPMissingLayer):
+                continue
+            assert isinstance(layer, MiniMaxM2DecoderLayer)
+            if isinstance(layer.block_sparse_moe, MiniMaxM2MoE):
+                example_moe = layer.block_sparse_moe
+                self.moe_mlp_layers.append(layer.block_sparse_moe)
+                self.moe_layers.append(layer.block_sparse_moe.experts)
+        self.extract_moe_parameters(example_moe)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor | IntermediateTensors:
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        return self.model.get_expert_mapping()
+
+
+def get_spec_layer_idx_from_weight_name(
+    config: PretrainedConfig, weight_name: str
+) -> int | None:
+    if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0):
+        layer_idx = config.num_hidden_layers
+        for i in range(config.num_mtp_modules):
+            if weight_name.startswith(f"model.layers.{layer_idx + i}."):
+                return layer_idx + i
+    return None
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index bbe8de6aa..5c441a793 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -1,761 +1,19 @@
 #!/bin/bash
-# SGLang Disaggregated Server Launcher with Model-Specific Configurations
+# Dual-Engine Disaggregated Server Dispatcher
 # =============================================================================
-
-# =============================================================================
-# Environment Configuration
-# =============================================================================
-
-NODE0_ADDR="${NODE0_ADDR:-localhost}"
-NODE_RANK="${NODE_RANK:-0}"
-MODEL_DIR="${MODEL_DIR:-}"
-MODEL_NAME="${MODEL_NAME:-}"
-
-xP="${xP:-1}" #-> Number of Prefill Workers
-yD="${yD:-1}" #-> Number of Decode Workers
-
-IPADDRS="${IPADDRS:-localhost}"
-HEADNODE_PORT="${HEADNODE_PORT:-20000}"
-# Parallelism Configuration
-PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}"
-PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}"
-PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}"
-DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}"
-DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}"
-DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}"
-DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}"
-
-# Benchmark Configuration
-BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}"
-BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}"
-BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}"
-BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}"
-BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}"
-BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}"
-
-# Dry Run for debugging purpose
-DRY_RUN="${DRY_RUN:-0}"
-
-# GPU count (expandable for different hardware)
-GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
-
-
-# =============================================================================
-# Dependencies and Environment Setup
-# =============================================================================
-source $SGLANG_WS_PATH/env.sh
-
-host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}')
-host_name=$(hostname)
-
-# MORI_RDMA_TC configuration (optional)
-# If set by runner, use it for RDMA traffic class configuration
-# If not set, RDMA operations will proceed without QoS/traffic class settings
-if [[ -n "${MORI_RDMA_TC}" ]]; then
-    echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration"
-    echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC"
-else
-    echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration."
-    echo "[INFO] This is normal for clusters without QoS requirements."
-fi
-
-# =============================================================================
-# Model-Specific Configuration from YAML
-# =============================================================================
-MODELS_YAML="${SGLANG_WS_PATH}/models.yaml"
-
-if [[ ! -f "$MODELS_YAML" ]]; then
-    echo "ERROR: models.yaml not found at $MODELS_YAML"
-    exit 1
-fi
-
-# Load model config via inline Python (PyYAML is available in SGLang containers)
-# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP")
-# is done here in Python to avoid bash glob-expanding the * characters.
-eval "$(python3 -c "
-import yaml, sys, os
-
-config_path = '${MODELS_YAML}'
-model_name = '${MODEL_NAME}'
-
-with open(config_path) as f:
-    models = yaml.safe_load(f)
-
-if model_name not in models:
-    print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1')
-    sys.exit(0)
-
-m = models[model_name]
-
-def eval_formula(val):
-    \"\"\"Evaluate chunked_prefill_size: if string, resolve variable names from env and compute.\"\"\"
-    if isinstance(val, (int, float)):
-        return int(val)
-    s = str(val)
-    # Build a namespace from env vars (convert numeric values to int)
-    ns = {}
-    for k, v in os.environ.items():
-        try:
-            ns[k] = int(v)
-        except (ValueError, TypeError):
-            pass
-    try:
-        return int(eval(s, {'__builtins__': {}}, ns))
-    except Exception as e:
-        print(f'echo \"WARNING: Cannot evaluate formula: {s} ({e})\"', file=sys.stderr)
-        return val
-
-def parse_range(cuda_range, default_start, default_end):
-    if '-' in str(cuda_range):
-        s, e = str(cuda_range).split('-')
-        return s, e
-    return str(default_start), str(default_end)
-
-# Output shell variables
-print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"')
-print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"')
-print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"')
-
-prefill = m.get('prefill', {})
-decode = m.get('decode', {})
-
-print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"')
-print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"')
-
-dp = prefill.get('dp', {})
-no_dp = prefill.get('no_dp', {})
-print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"')
-print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"')
-print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"')
-print(f'PREFILL_CONTEXT_LENGTH_DP=\"{dp.get(\"context_length\", \"\")}\"')
-print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"')
-print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"')
-print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"')
-print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"')
-s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128)
-print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"')
-print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"')
-
-print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"')
-print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"')
-
-dp = decode.get('dp', {})
-ep_only = decode.get('ep_only', {})
-no_dp = decode.get('no_dp', {})
-
-# Decode DP config
-print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"')
-print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"')
-s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160)
-print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"')
-print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"')
-
-# Decode EP-only config (EP enabled but DP disabled)
-print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"')
-print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"')
-s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256)
-print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"')
-print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"')
-
-# Decode no-DP config
-print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"')
-print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"')
-s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128)
-print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"')
-print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"')
-")"
-
-echo "Loaded model configuration for: $MODEL_NAME"
-
-# Compute DP-dependent prefill parameters
-if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then
-    prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP)
-    prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP
-    prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP
-    prefill_context_length=$PREFILL_CONTEXT_LENGTH_DP
-    prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_DP
-    prefill_enable_two_batch_overlap=$PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP
-else
-    prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END))
-    prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP
-    prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP
-    prefill_context_length=""
-    prefill_max_total_tokens=""
-    prefill_enable_two_batch_overlap="false"
-fi
-
-# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp)
-if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
-    decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END))
-    decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE))
-elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then
-    decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END))
-    decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY
-else
-    decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END))
-    decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP
-fi
-
-# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
-PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
-if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
-    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
-fi
-if [[ -n "$prefill_context_length" ]]; then
-    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length ${prefill_context_length}"
-fi
-if [[ -n "$prefill_max_total_tokens" ]]; then
-    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --max-total-tokens ${prefill_max_total_tokens}"
-fi
-if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then
-    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap"
-    PREFILL_SDMA_ENV="MORI_ENABLE_SDMA=true"
-fi
-
-DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} "
-
-if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then
-    DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance"
-fi
-
-if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then
-    MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
-    MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
-fi
-
-# =============================================================================
-# Cluster Topology Configuration
-# =============================================================================
-IFS=',' read -ra IP_ARRAY <<< "$IPADDRS"
-
-# Ceiling division by GPUS_PER_NODE for nodes-per-worker
-PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + 7) / GPUS_PER_NODE))
-DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + 7) / GPUS_PER_NODE))
-NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP))
-
-# Build prefill arguments dynamically based on xP
-PREFILL_HEADNODE_URLS=()
-PREFILL_ARGS=""
-for i in $(seq 0 $((xP - 1))); do
-    prefill_idx=$((i * PREFILL_NODES_PER_WORKER))
-    PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}"
-    PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$prefill_idx]}:8000"
-done
-
-# Build decode arguments dynamically based on yD
-DECODE_HEADNODE_URLS=()
-DECODE_ARGS=""
-for i in $(seq 0 $((yD - 1))); do
-    decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET))
-    DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}"
-    DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000"
-done
-
-echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}"
-echo "Decode  worker headnode list: ${DECODE_HEADNODE_URLS[@]}"
-
-# =============================================================================
-# Configuration Builder Functions
-# =============================================================================
-
-build_server_config() {
-    local mode="$1"
-    local model_name="$2"
-    local tp_size="$3"
-    local enable_ep="$4"
-    local enable_dp="$5"
-    local decode_mtp_size="$6"
-
-    # Calculate EP and DP sizes based on enable flags
-    local ep_size=1
-    local dp_size=1
-
-    if [[ "$enable_ep" == "true" ]]; then
-        ep_size=$tp_size
-    fi
-
-    if [[ "$enable_dp" == "true" ]]; then
-        dp_size=$tp_size
-    fi
-
-    # Build parallelism arguments
-    local parallel_args="--tp-size ${tp_size}"
-
-    if [[ "$enable_ep" == "true" ]]; then
-        parallel_args="$parallel_args --ep-size ${ep_size}"
-    fi
-
-    if [[ "$enable_dp" == "true" ]]; then
-        parallel_args="$parallel_args --dp-size ${dp_size}"
-    fi
-
-    # Get model-specific configuration from YAML-loaded variables
-    local base_config="$MODEL_BASE_FLAGS"
-    local mtp_config=""
-    local dp_config=""
-    local specific_config=""
-
-    # MTP config (only if MTP is enabled and mode is decode)
-    if [ "$decode_mtp_size" -gt 0 ]; then
-        mtp_config="${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))"
-    fi
-
-    # DP config (only if DP is enabled)
-    if [[ "$enable_dp" == "true" ]]; then
-        dp_config="$MODEL_DP_FLAGS"
-    fi
-
-    # Mode-specific config
-    if [[ "$mode" == "prefill" ]]; then
-        specific_config="$PREFILL_MODE_FLAGS"
-    elif [[ "$mode" == "decode" ]]; then
-        specific_config="$DECODE_MODE_FLAGS"
-    fi
-
-    # Combine: parallel args + base config + mtp config (decode only) + dp config + specific config
-    local full_config="$parallel_args"
-    if [[ -n "$base_config" ]]; then
-        full_config="$full_config $base_config"
-    fi
-    if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then
-        full_config="$full_config $mtp_config"
-    fi
-    if [[ -n "$dp_config" ]]; then
-        full_config="$full_config $dp_config"
-    fi
-    if [[ -n "$specific_config" ]]; then
-        full_config="$full_config $specific_config"
-    fi
-
-    echo "$full_config"
-}
-
-# Build complete server configurations
-PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE")
-DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE")
-
-if [[ -n "$MODEL_NAME" ]]; then
-    echo "Using model-specific configuration for: $MODEL_NAME"
-fi
-
-if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]; then
-    PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g')
-    DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g')
-    unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL
-    unset MORI_MOE_MAX_INPUT_TOKENS_DECODE
-    # NOTE: that currently with fp8_combine set, the evals do not pass on InferenceX eval harness
-    # or on SGLang native harness for high concurrency 4k and gets no where near the golden score of
-    # 0.95 on even basic GSM8k grade school math as confirmed by @billishyahao from AMD
-    # and as confirmed by @Oseltamivir. This was initally merged with @billishyahao promising 
-    # that an fast follow PR to fix the evals via having quant correction in the fp8 combine
-fi
-
+# Dispatches to the engine-specific server launcher based on ENGINE env var.
+#   ENGINE=sglang-disagg (default) -> server_sglang.sh (SGLang + MoRI)
+#   ENGINE=vllm-disagg             -> server_vllm.sh  (vLLM + Nixl/MoRI-IO)
 # =============================================================================
-# Container Synchronization
-# =============================================================================
-
-echo "Waiting at the container creation barrier on $host_name"
-python3 $SGLANG_WS_PATH/sync.py barrier \
-    --local-ip ${host_ip} \
-    --local-port 5000 \
-    --enable-port \
-    --node-ips ${IPADDRS} \
-    --node-ports 5000 \
-    --wait-for-all-ports \
-    --timeout 300
-
-
-# =============================================================================
-# Node Role Assignment and Server Launch
-# =============================================================================
-
-if [ "$NODE_RANK" -eq 0 ]; then
-    echo "NODE INFO ======================================="
-    echo "================================================"
-    echo "Node List : ${SLURM_JOB_NODELIST}"
-    echo "Node IPs : ${IPADDRS}"
-    echo "Model Name : ${MODEL_NAME:-'Not specified'}"
-    echo "================================================"
-
-    echo "CLUSTER INFO ===================================="
-    echo "================================================"
-    echo "${host_name}:${host_ip} is Proxy Node and Prefill Node"
-    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
-    echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}"
-    echo "Decode  parallelism: TP=${DECODE_TP_SIZE},  EP enabled: ${DECODE_ENABLE_EP},  DP enabled: ${DECODE_ENABLE_DP},  MTP size=${DECODE_MTP_SIZE}"
-    echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}"
-    echo "Decode servers  ($((DECODE_TP_SIZE/GPUS_PER_NODE))  nodes): ${DECODE_ARGS}"
-    echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}"
-    echo "Decode  env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} "
-    echo "Decode  env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} "
-
-    echo "================================================"
-
-    # start the head prefill server
-    PREFILL_MORI_MOE_ENV=""
-    set -x
-    if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then
-        PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
-    fi
-    set +x
-    PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
-        --model-path $MODEL_DIR/$MODEL_NAME \
-        --disaggregation-mode prefill \
-        --disaggregation-ib-device ${IBDEVICES} \
-        --host 0.0.0.0 \
-        --port 8000 \
-        --trust-remote-code \
-        ${PREFILL_SERVER_CONFIG} "
-
-    if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
-        PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0"
-    fi
-
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $PREFILL_CMD"
-    else
-        set -x
-        eval "$PREFILL_CMD" \
-            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log &
-        set +x
-        prefill0_pid=$!
-    fi
-
-
-    echo "Waiting for all prefill and decode servers to be up . . ."
-
-
-    BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
-        --node-ips ${IPADDRS} \
-        --node-ports 8000 \
-        --wait-for-all-ports \
-        --timeout 1800"
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $BARRIER_CMD"
-    else
-        eval "$BARRIER_CMD"
-    fi
-    echo "Congratulations!!! All prefill and decode servers are up . . ."
-
-    ROUTER_CMD="python -m sglang_router.launch_router \
-        --pd-disaggregation \
-        --port 30000 \
-        --policy random \
-        --prefill-policy random \
-        --decode-policy random \
-        ${PREFILL_ARGS} \
-        ${DECODE_ARGS}"
-
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $ROUTER_CMD"
-    else
-        ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log"
-        set -x
-        if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then
-            eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" &
-        else
-            eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 &
-        fi
-        set +x
-        proxy_pid=$!
-
-        # Wait for router to be ready via health endpoint
-        HEALTH_BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
-            --node-ips ${NODE0_ADDR} \
-            --node-ports 30000 \
-            --wait-for-all-health \
-            --health-endpoint /readiness \
-            --timeout 1800"
-
-        if [[ "$DRY_RUN" -eq 1 ]]; then
-            echo "DRY RUN: $HEALTH_BARRIER_CMD"
-        else
-            eval "$HEALTH_BARRIER_CMD"
-        fi
-
-        echo "Router is ready for benchmarking"
-    fi
-
-
-    echo "Ready for benchmarking on ${host_name}:${host_ip}"
-
-    echo "Benchmarking on ${host_name}:${host_ip}"
-    cd $SGLANG_WS_PATH
-
-    # Export IS_MTP based on whether MTP is enabled
-    if [ "$DECODE_MTP_SIZE" -gt 0 ]; then
-        export IS_MTP=true
-    else
-        export IS_MTP=false
-    fi
-
-    # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier
-    BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \
-        $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
-        ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \
-        ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}"
-
-    if [[ "${EVAL_ONLY:-false}" == "true" ]]; then
-        echo "EVAL_ONLY mode: skipping throughput benchmark"
-    elif [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $BENCH_CMD"
-    else
-        set -x
-        eval "$BENCH_CMD"
-        set +x
-    fi
 
-    # Run evaluation if requested (before killing router)
-    if [[ "${RUN_EVAL:-false}" == "true" ]]; then
-        echo "Running lm-eval evaluation on Node 0..."
+ENGINE="${ENGINE:-sglang-disagg}"  # launcher selector; defaults to sglang-disagg
+WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}"  # first non-empty wins; else this script's dir
+export WS_PATH ENGINE  # exported so child processes inherit both
 
-        # Health check: verify the router is still serving before running eval.
-        # The throughput benchmark may have crashed/exhausted decode workers.
-        EVAL_HEALTH_OK=false
-        for _attempt in 1 2 3; do
-            if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then
-                EVAL_HEALTH_OK=true
-                break
-            fi
-            echo "Eval health check attempt $_attempt failed, retrying in 10s..."
-            sleep 10
-        done
-
-        if [[ "$EVAL_HEALTH_OK" != "true" ]]; then
-            echo "WARNING: Router health check failed after 3 attempts. Skipping eval."
-        else
-            # Must run from repo root so utils/evals/${task}.yaml resolves
-            pushd /workspace
-
-            # Source eval functions from benchmark_lib.sh
-            source /workspace/benchmarks/benchmark_lib.sh
-
-            # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list
-            if [[ -n "${EVAL_CONC:-}" ]]; then
-                export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}"
-            else
-                export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
-            fi
-
-            # Override eval context length with model's configured context_length
-            if [[ -n "$prefill_context_length" ]]; then
-                export EVAL_MAX_MODEL_LEN="$prefill_context_length"
-            fi
-
-            if [[ "$DRY_RUN" -eq 1 ]]; then
-                echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})"
-            else
-                # Run lm-eval against the router on port 30000
-                run_eval --framework lm-eval --port 30000
-                eval_rc=$?
-
-                if [[ $eval_rc -ne 0 ]]; then
-                    echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2
-                    EVAL_FAILED=1
-                else
-                    # Set metadata env vars for append_lm_eval_summary
-                    export TP="${PREFILL_TP_SIZE}"
-                    export CONC="${EVAL_CONCURRENT_REQUESTS}"
-                    export EP_SIZE=1
-                    [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}"
-                    export PREFILL_TP="${PREFILL_TP_SIZE}"
-                    export PREFILL_EP=1
-                    [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}"
-                    export PREFILL_NUM_WORKERS="${xP}"
-                    export DECODE_TP="${DECODE_TP_SIZE}"
-                    export DECODE_EP=1
-                    [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}"
-                    export DECODE_NUM_WORKERS="${yD}"
-                    export DP_ATTENTION="${PREFILL_ENABLE_DP}"
-                    export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}"
-                    export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}"
-                    export ISL="${BENCH_INPUT_LEN}"
-                    export OSL="${BENCH_OUTPUT_LEN}"
-                    # IS_MULTINODE, FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE,
-                    # RESULT_FILENAME are already set via Docker -e flags from job.slurm
-
-                    append_lm_eval_summary
-                    # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace
-
-                    # Copy eval artifacts to run_logs for NFS extraction by runner
-                    EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results"
-                    mkdir -p "$EVAL_COPY_DIR"
-                    for f in meta_env.json; do
-                        [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/"
-                    done
-                    # Use find for glob patterns to avoid "no match" errors
-                    find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \;
-                    find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \;
-
-                    echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR"
-                fi
-            fi
-
-            popd
-        fi
-    fi
-
-    # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host)
-    LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs"
-    mkdir -p "$LOGS_OUTPUT"
-
-    if [[ "$DRY_RUN" -eq 0 ]]; then
-        cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/"
-        echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}"
-    fi
-
-    echo "Killing the proxy server and prefill server"
-
-    if [[ "$DRY_RUN" -eq 0 ]]; then
-        kill $proxy_pid
-        kill $prefill0_pid
-    fi
-
-    if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then
-        echo "ERROR: eval failed; exiting node-0 with rc=1"
-        exit 1
-    fi
-
-elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
-    echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})"
-    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
-    echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}"
-
-    PREFILL_MORI_MOE_ENV=""
-    set -x
-    if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then
-        PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
-    fi
-    set +x
-    PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
-        --model-path $MODEL_DIR/${MODEL_NAME} \
-        --disaggregation-mode prefill \
-        --disaggregation-ib-device ${IBDEVICES} \
-        --host 0.0.0.0 \
-        --port 8000 \
-        --trust-remote-code \
-        ${PREFILL_SERVER_CONFIG} "
-
-    if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
-        rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER))
-        prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER))
-        PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank"
-    fi
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $PREFILL_CMD"
-    else
-        set -x
-        eval "$PREFILL_CMD" \
-            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log &
-        set +x
-        prefill_pid=$!
-    fi
-
-    echo "Waiting for proxy server to be up..."
-    BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
-        --node-ips ${NODE0_ADDR} \
-        --node-ports 30000 \
-        --wait-for-all-ports \
-        --timeout 1800"
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $BARRIER_CMD"
-    else
-        eval "$BARRIER_CMD"
-    fi
-
-    echo "Waiting until proxy server closes..."
-    WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \
-        --remote-ip ${NODE0_ADDR} \
-        --remote-port 30000"
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $WAIT_CMD"
-    else
-        eval "$WAIT_CMD"
-    fi
-
-    echo "Killing the rank $NODE_RANK prefill server"
-
-    if [[ "$DRY_RUN" -eq 0 ]]; then
-        kill $prefill_pid
-    fi
+echo "[DISPATCHER] ENGINE=$ENGINE  WS_PATH=$WS_PATH"
 
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    source "$WS_PATH/server_vllm.sh"
 else
-    RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER))
-    echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})"
-    echo "Using decode config: $DECODE_SERVER_CONFIG"
-    echo "Decode node rank: $RANK"
-    echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}"
-
-    DECODE_MORI_MOE_ENV=""
-    set -x
-    if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_DECODE" ]]; then
-        DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}"
-    fi
-    set +x
-    DECODE_CMD="${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
-        --model-path ${MODEL_DIR}/${MODEL_NAME} \
-        --disaggregation-mode decode \
-        --disaggregation-ib-device ${IBDEVICES} \
-        --host 0.0.0.0 \
-        --port 8000 \
-        --trust-remote-code \
-        ${DECODE_SERVER_CONFIG} "
-
-    if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then
-        rank=$((RANK % DECODE_NODES_PER_WORKER))
-        decode_idx=$((RANK / DECODE_NODES_PER_WORKER))
-        DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes ${DECODE_NODES_PER_WORKER} --node-rank $rank"
-    fi
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $DECODE_CMD"
-    else
-        set -x
-        eval "$DECODE_CMD" \
-            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log &
-
-        set +x
-        decode_pid=$!
-    fi
-
-
-    echo "Waiting for proxy server to be up..."
-    BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
-        --node-ips ${NODE0_ADDR} \
-        --node-ports 30000 \
-        --wait-for-all-ports \
-        --timeout 1800"
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $BARRIER_CMD"
-    else
-        eval "$BARRIER_CMD"
-    fi
-
-
-    echo "Waiting until proxy server closes..."
-    WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \
-        --remote-ip ${NODE0_ADDR} \
-        --remote-port 30000"
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $WAIT_CMD"
-    else
-        eval "$WAIT_CMD"
-    fi
-
-    echo "Killing the rank $RANK decode server"
-    if [[ "$DRY_RUN" -eq 0 ]]; then
-        kill $decode_pid
-    fi
-
+    source "$WS_PATH/server_sglang.sh"
 fi
-
-echo "Script completed successfully"
-exit 0
diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh
new file mode 100755
index 000000000..b410bc978
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/server_sglang.sh
@@ -0,0 +1,761 @@
+#!/bin/bash
+# SGLang Disaggregated Server Launcher with Model-Specific Configurations
+# =============================================================================
+
+# =============================================================================
+# Environment Configuration
+# =============================================================================
+
+NODE0_ADDR="${NODE0_ADDR:-localhost}"
+NODE_RANK="${NODE_RANK:-0}"
+MODEL_DIR="${MODEL_DIR:-}"
+MODEL_NAME="${MODEL_NAME:-}"
+
+xP="${xP:-1}" #-> Number of Prefill Workers
+yD="${yD:-1}" #-> Number of Decode Workers
+
+IPADDRS="${IPADDRS:-localhost}"
+HEADNODE_PORT="${HEADNODE_PORT:-20000}"
+# Parallelism Configuration
+PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}"
+PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}"
+PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}"
+DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}"
+DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}"
+DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}"
+DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}"
+
+# Benchmark Configuration
+BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}"
+BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}"
+BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}"
+BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}"
+BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}"
+BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}"
+
+# Dry Run for debugging purpose
+DRY_RUN="${DRY_RUN:-0}"
+
+# GPU count (expandable for different hardware)
+GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
+
+
+# =============================================================================
+# Dependencies and Environment Setup
+# =============================================================================
+source $SGLANG_WS_PATH/env.sh
+
+host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}')
+host_name=$(hostname)
+
+# MORI_RDMA_TC configuration (optional)
+# If set by runner, use it for RDMA traffic class configuration
+# If not set, RDMA operations will proceed without QoS/traffic class settings
+if [[ -n "${MORI_RDMA_TC}" ]]; then
+    echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration"
+    echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC"
+else
+    echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration."
+    echo "[INFO] This is normal for clusters without QoS requirements."
+fi
+
+# =============================================================================
+# Model-Specific Configuration from YAML
+# =============================================================================
+MODELS_YAML="${SGLANG_WS_PATH}/models.yaml"
+
+if [[ ! -f "$MODELS_YAML" ]]; then
+    echo "ERROR: models.yaml not found at $MODELS_YAML"
+    exit 1
+fi
+
+# Load model config via inline Python (PyYAML is available in SGLang containers)
+# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP")
+# is done here in Python to avoid bash glob-expanding the * characters.
+MODEL_CONFIG_SHELL_VARS="$(python3 -c "
+import yaml, sys, os
+# Prints shell assignments on stdout; the caller evals them only when this script exits 0.
+config_path = '${MODELS_YAML}'
+model_name = '${MODEL_NAME}'
+
+with open(config_path) as f:
+    models = yaml.safe_load(f)
+
+if model_name not in models:
+    print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1')
+    sys.exit(0)
+
+m = models[model_name]
+
+def eval_formula(val):
+    \"\"\"Evaluate chunked_prefill_size: if string, resolve variable names from env and compute.\"\"\"
+    if isinstance(val, (int, float)):
+        return int(val)
+    s = str(val)
+    # Build a namespace from env vars (convert numeric values to int)
+    ns = {}
+    for k, v in os.environ.items():
+        try:
+            ns[k] = int(v)
+        except (ValueError, TypeError):
+            pass
+    try:
+        return int(eval(s, {'__builtins__': {}}, ns))  # NOTE(review): eval of models.yaml text; emptied builtins is not a sandbox, keep the file trusted
+    except Exception as e:
+        print(f'WARNING: Cannot evaluate formula: {s} ({e})', file=sys.stderr)
+        return val
+
+def parse_range(cuda_range, default_start, default_end):
+    if '-' in str(cuda_range):
+        s, e = str(cuda_range).split('-')
+        return s, e
+    return str(default_start), str(default_end)
+
+# Output shell variables
+print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"')
+print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"')
+print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"')
+
+prefill = m.get('prefill', {})
+decode = m.get('decode', {})
+
+print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"')
+print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"')
+
+dp = prefill.get('dp', {})
+no_dp = prefill.get('no_dp', {})
+print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"')
+print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"')
+print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"')
+print(f'PREFILL_CONTEXT_LENGTH_DP=\"{dp.get(\"context_length\", \"\")}\"')
+print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"')
+print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"')
+print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"')
+print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"')
+s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128)
+print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"')
+print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"')
+
+print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"')
+print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"')
+
+dp = decode.get('dp', {})
+ep_only = decode.get('ep_only', {})
+no_dp = decode.get('no_dp', {})
+
+# Decode DP config
+print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"')
+print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"')
+s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160)
+print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"')
+print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"')
+
+# Decode EP-only config (EP enabled but DP disabled)
+print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"')
+print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"')
+s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256)
+print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"')
+print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"')
+
+# Decode no-DP config
+print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"')
+print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"')
+s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128)
+print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"')
+print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"')
+")" || { echo "ERROR: failed to load model configuration from $MODELS_YAML" >&2; exit 1; }
+eval "$MODEL_CONFIG_SHELL_VARS"
+echo "Loaded model configuration for: $MODEL_NAME"
+
+# Compute DP-dependent prefill parameters
+if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then
+    prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP)
+    prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP
+    prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP
+    prefill_context_length=$PREFILL_CONTEXT_LENGTH_DP
+    prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_DP
+    prefill_enable_two_batch_overlap=$PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP
+else
+    prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END))
+    prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP
+    prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP
+    prefill_context_length=""
+    prefill_max_total_tokens=""
+    prefill_enable_two_batch_overlap="false"
+fi
+
+# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp)
+if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
+    decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END))
+    decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE))
+elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then
+    decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END))
+    decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY
+else
+    decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END))
+    decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP
+fi
+
+# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
+PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
+if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
+    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
+fi
+if [[ -n "$prefill_context_length" ]]; then
+    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length ${prefill_context_length}"
+fi
+if [[ -n "$prefill_max_total_tokens" ]]; then
+    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --max-total-tokens ${prefill_max_total_tokens}"
+fi
+if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then
+    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap"
+    PREFILL_SDMA_ENV="MORI_ENABLE_SDMA=true"
+fi
+
+DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} "
+
+if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then
+    DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance"
+fi
+# With MTP, scale decode token budgets by (DECODE_MTP_SIZE + 1); an unset optional MoE cap must stay unset, not become 0.
+if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then
+    MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
+    [[ -z "${MORI_MOE_MAX_INPUT_TOKENS_DECODE:-}" ]] || MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
+fi
+
+# =============================================================================
+# Cluster Topology Configuration
+# =============================================================================
+IFS=',' read -ra IP_ARRAY <<< "$IPADDRS"
+
+# Nodes per worker = ceil(TP_SIZE / GPUS_PER_NODE); the rounding term tracks GPUS_PER_NODE instead of a fixed 7
+PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE))
+DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE))
+NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP))  # total prefill nodes; decode entries in IP_ARRAY start at this index
+
+# Build prefill arguments dynamically based on xP
+PREFILL_HEADNODE_URLS=()
+PREFILL_ARGS=""
+for i in $(seq 0 $((xP - 1))); do
+    prefill_idx=$((i * PREFILL_NODES_PER_WORKER))
+    PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}"
+    PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$prefill_idx]}:8000"
+done
+
+# Build decode arguments dynamically based on yD
+DECODE_HEADNODE_URLS=()
+DECODE_ARGS=""
+for i in $(seq 0 $((yD - 1))); do
+    decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET))
+    DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}"
+    DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000"
+done
+
+echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}"
+echo "Decode  worker headnode list: ${DECODE_HEADNODE_URLS[@]}"
+
+# =============================================================================
+# Configuration Builder Functions
+# =============================================================================
+
+build_server_config() {  # Echoes the composed server flag string for one role; reads MODEL_*_FLAGS and *_MODE_FLAGS globals
+    local mode="$1"  # "prefill" or "decode"; any other value adds no mode-specific flags
+    local model_name="$2"  # accepted positionally; not read inside this function
+    local tp_size="$3"  # value for --tp-size; reused as the EP/DP size when those are enabled
+    local enable_ep="$4"  # "true" adds --ep-size
+    local enable_dp="$5"  # "true" adds --dp-size and MODEL_DP_FLAGS
+    local decode_mtp_size="$6"  # > 0 builds the speculative-decoding flags
+
+    # EP and DP sizes default to 1 and take the TP size when the matching flag is "true"
+    local ep_size=1
+    local dp_size=1
+
+    if [[ "$enable_ep" == "true" ]]; then
+        ep_size=$tp_size
+    fi
+
+    if [[ "$enable_dp" == "true" ]]; then
+        dp_size=$tp_size
+    fi
+
+    # Build parallelism arguments (--ep-size / --dp-size are emitted only when enabled)
+    local parallel_args="--tp-size ${tp_size}"
+
+    if [[ "$enable_ep" == "true" ]]; then
+        parallel_args="$parallel_args --ep-size ${ep_size}"
+    fi
+
+    if [[ "$enable_dp" == "true" ]]; then
+        parallel_args="$parallel_args --dp-size ${dp_size}"
+    fi
+
+    # Get model-specific configuration from YAML-loaded variables
+    local base_config="$MODEL_BASE_FLAGS"
+    local mtp_config=""
+    local dp_config=""
+    local specific_config=""
+
+    # MTP config: built whenever the MTP size is > 0; it is appended only for decode mode in the combine step below
+    if [ "$decode_mtp_size" -gt 0 ]; then
+        mtp_config="${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))"
+    fi
+
+    # DP config (only if DP is enabled)
+    if [[ "$enable_dp" == "true" ]]; then
+        dp_config="$MODEL_DP_FLAGS"
+    fi
+
+    # Mode-specific config
+    if [[ "$mode" == "prefill" ]]; then
+        specific_config="$PREFILL_MODE_FLAGS"
+    elif [[ "$mode" == "decode" ]]; then
+        specific_config="$DECODE_MODE_FLAGS"
+    fi
+
+    # Combine: parallel args + base config + mtp config (decode only) + dp config + specific config
+    local full_config="$parallel_args"
+    if [[ -n "$base_config" ]]; then
+        full_config="$full_config $base_config"
+    fi
+    if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then
+        full_config="$full_config $mtp_config"
+    fi
+    if [[ -n "$dp_config" ]]; then
+        full_config="$full_config $dp_config"
+    fi
+    if [[ -n "$specific_config" ]]; then
+        full_config="$full_config $specific_config"
+    fi
+
+    echo "$full_config"
+}
+
+# Build complete server configurations
+PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE")
+DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE")
+
+if [[ -n "$MODEL_NAME" ]]; then
+    echo "Using model-specific configuration for: $MODEL_NAME"
+fi
+
+if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]; then
+    PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g')
+    DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g')
+    unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL
+    unset MORI_MOE_MAX_INPUT_TOKENS_DECODE
+    # NOTE: currently, with fp8_combine set, the evals do not pass on the InferenceX eval harness
+    # or on the SGLang native harness for high concurrency 4k, and get nowhere near the golden score of
+    # 0.95 on even basic GSM8K grade-school math, as confirmed by @billishyahao from AMD
+    # and by @Oseltamivir. This was initially merged with @billishyahao promising
+    # a fast-follow PR to fix the evals by adding quant correction to the fp8 combine.
+fi
+
+# =============================================================================
+# Container Synchronization
+# =============================================================================
+
+echo "Waiting at the container creation barrier on $host_name"
+python3 $SGLANG_WS_PATH/sync.py barrier \
+    --local-ip ${host_ip} \
+    --local-port 5000 \
+    --enable-port \
+    --node-ips ${IPADDRS} \
+    --node-ports 5000 \
+    --wait-for-all-ports \
+    --timeout 300
+
+
+# =============================================================================
+# Node Role Assignment and Server Launch
+# =============================================================================
+
+if [ "$NODE_RANK" -eq 0 ]; then
+    echo "NODE INFO ======================================="
+    echo "================================================"
+    echo "Node List : ${SLURM_JOB_NODELIST}"
+    echo "Node IPs : ${IPADDRS}"
+    echo "Model Name : ${MODEL_NAME:-'Not specified'}"
+    echo "================================================"
+
+    echo "CLUSTER INFO ===================================="
+    echo "================================================"
+    echo "${host_name}:${host_ip} is Proxy Node and Prefill Node"
+    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
+    echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}"
+    echo "Decode  parallelism: TP=${DECODE_TP_SIZE},  EP enabled: ${DECODE_ENABLE_EP},  DP enabled: ${DECODE_ENABLE_DP},  MTP size=${DECODE_MTP_SIZE}"
+    echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}"
+    echo "Decode servers  ($((DECODE_TP_SIZE/GPUS_PER_NODE))  nodes): ${DECODE_ARGS}"
+    echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}"
+    echo "Decode  env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} "
+    echo "Decode  env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} "
+
+    echo "================================================"
+
+    # start the head prefill server
+    PREFILL_MORI_MOE_ENV=""
+    set -x
+    if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then
+        PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
+    fi
+    set +x
+    PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+        --model-path $MODEL_DIR/$MODEL_NAME \
+        --disaggregation-mode prefill \
+        --disaggregation-ib-device ${IBDEVICES} \
+        --host 0.0.0.0 \
+        --port 8000 \
+        --trust-remote-code \
+        ${PREFILL_SERVER_CONFIG} "
+
+    if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
+        PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0"
+    fi
+
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $PREFILL_CMD"
+    else
+        set -x
+        eval "$PREFILL_CMD" \
+            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log &
+        set +x
+        prefill0_pid=$!
+    fi
+
+
+    echo "Waiting for all prefill and decode servers to be up . . ."
+
+
+    BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
+        --node-ips ${IPADDRS} \
+        --node-ports 8000 \
+        --wait-for-all-ports \
+        --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BARRIER_CMD"
+    else
+        eval "$BARRIER_CMD"
+    fi
+    echo "Congratulations!!! All prefill and decode servers are up . . ."
+
+    ROUTER_CMD="python -m sglang_router.launch_router \
+        --pd-disaggregation \
+        --port 30000 \
+        --policy random \
+        --prefill-policy random \
+        --decode-policy random \
+        ${PREFILL_ARGS} \
+        ${DECODE_ARGS}"
+
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $ROUTER_CMD"
+    else
+        ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log"
+        set -x
+        if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then
+            eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" &
+        else
+            eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 &
+        fi
+        set +x
+        proxy_pid=$!
+
+        # Wait for router to be ready via health endpoint
+        HEALTH_BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
+            --node-ips ${NODE0_ADDR} \
+            --node-ports 30000 \
+            --wait-for-all-health \
+            --health-endpoint /readiness \
+            --timeout 1800"
+
+        if [[ "$DRY_RUN" -eq 1 ]]; then
+            echo "DRY RUN: $HEALTH_BARRIER_CMD"
+        else
+            eval "$HEALTH_BARRIER_CMD"
+        fi
+
+        echo "Router is ready for benchmarking"
+    fi
+
+
+    echo "Ready for benchmarking on ${host_name}:${host_ip}"
+
+    echo "Benchmarking on ${host_name}:${host_ip}"
+    cd $SGLANG_WS_PATH
+
+    # Export IS_MTP based on whether MTP is enabled
+    if [ "$DECODE_MTP_SIZE" -gt 0 ]; then
+        export IS_MTP=true
+    else
+        export IS_MTP=false
+    fi
+
+    # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier
+    BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \
+        $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
+        ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \
+        ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}"
+
+    if [[ "${EVAL_ONLY:-false}" == "true" ]]; then
+        echo "EVAL_ONLY mode: skipping throughput benchmark"
+    elif [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BENCH_CMD"
+    else
+        set -x
+        eval "$BENCH_CMD"
+        set +x
+    fi
+
+    # Run evaluation if requested (before killing router)
+    if [[ "${RUN_EVAL:-false}" == "true" ]]; then
+        echo "Running lm-eval evaluation on Node 0..."
+
+        # Health check: verify the router is still serving before running eval.
+        # The throughput benchmark may have crashed/exhausted decode workers.
+        EVAL_HEALTH_OK=false
+        for _attempt in 1 2 3; do
+            if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then
+                EVAL_HEALTH_OK=true
+                break
+            fi
+            echo "Eval health check attempt $_attempt failed, retrying in 10s..."
+            sleep 10
+        done
+
+        if [[ "$EVAL_HEALTH_OK" != "true" ]]; then
+            echo "WARNING: Router health check failed after 3 attempts. Skipping eval."
+        else
+            # Must run from repo root so utils/evals/${task}.yaml resolves
+            pushd /workspace
+
+            # Source eval functions from benchmark_lib.sh
+            source /workspace/benchmarks/benchmark_lib.sh
+
+            # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list
+            if [[ -n "${EVAL_CONC:-}" ]]; then
+                export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}"
+            else
+                export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
+            fi
+
+            # Override eval context length with model's configured context_length
+            if [[ -n "$prefill_context_length" ]]; then
+                export EVAL_MAX_MODEL_LEN="$prefill_context_length"
+            fi
+
+            if [[ "$DRY_RUN" -eq 1 ]]; then
+                echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})"
+            else
+                # Run lm-eval against the router on port 30000
+                run_eval --framework lm-eval --port 30000
+                eval_rc=$?
+
+                if [[ $eval_rc -ne 0 ]]; then
+                    echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2
+                    EVAL_FAILED=1
+                else
+                    # Set metadata env vars for append_lm_eval_summary
+                    export TP="${PREFILL_TP_SIZE}"
+                    export CONC="${EVAL_CONCURRENT_REQUESTS}"
+                    export EP_SIZE=1
+                    [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}"
+                    export PREFILL_TP="${PREFILL_TP_SIZE}"
+                    export PREFILL_EP=1
+                    [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}"
+                    export PREFILL_NUM_WORKERS="${xP}"
+                    export DECODE_TP="${DECODE_TP_SIZE}"
+                    export DECODE_EP=1
+                    [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}"
+                    export DECODE_NUM_WORKERS="${yD}"
+                    export DP_ATTENTION="${PREFILL_ENABLE_DP}"
+                    export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}"
+                    export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}"
+                    export ISL="${BENCH_INPUT_LEN}"
+                    export OSL="${BENCH_OUTPUT_LEN}"
+                    # IS_MULTINODE, FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE,
+                    # RESULT_FILENAME are already set via Docker -e flags from job.slurm
+
+                    append_lm_eval_summary
+                    # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace
+
+                    # Copy eval artifacts to run_logs for NFS extraction by runner
+                    EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results"
+                    mkdir -p "$EVAL_COPY_DIR"
+                    for f in meta_env.json; do
+                        [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/"
+                    done
+                    # Use find for glob patterns to avoid "no match" errors
+                    find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \;
+                    find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \;
+
+                    echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR"
+                fi
+            fi
+
+            popd
+        fi
+    fi
+
+    # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host)
+    LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs"
+    mkdir -p "$LOGS_OUTPUT"
+
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/"
+        echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}"
+    fi
+
+    echo "Killing the proxy server and prefill server"
+
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        kill $proxy_pid
+        kill $prefill0_pid
+    fi
+
+    if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then
+        echo "ERROR: eval failed; exiting node-0 with rc=1"
+        exit 1
+    fi
+
+elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
+    echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})"
+    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
+    echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}"
+
+    PREFILL_MORI_MOE_ENV=""
+    set -x
+    if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then
+        PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
+    fi
+    set +x
+    PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+        --model-path $MODEL_DIR/${MODEL_NAME} \
+        --disaggregation-mode prefill \
+        --disaggregation-ib-device ${IBDEVICES} \
+        --host 0.0.0.0 \
+        --port 8000 \
+        --trust-remote-code \
+        ${PREFILL_SERVER_CONFIG} "
+
+    if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
+        rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER))
+        prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER))
+        PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank"
+    fi
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $PREFILL_CMD"
+    else
+        set -x
+        eval "$PREFILL_CMD" \
+            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log &
+        set +x
+        prefill_pid=$!
+    fi
+
+    echo "Waiting for proxy server to be up..."
+    BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports 30000 \
+        --wait-for-all-ports \
+        --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BARRIER_CMD"
+    else
+        eval "$BARRIER_CMD"
+    fi
+
+    echo "Waiting until proxy server closes..."
+    WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \
+        --remote-ip ${NODE0_ADDR} \
+        --remote-port 30000"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $WAIT_CMD"
+    else
+        eval "$WAIT_CMD"
+    fi
+
+    echo "Killing the rank $NODE_RANK prefill server"
+
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        kill $prefill_pid
+    fi
+
+else
+    RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER))
+    echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})"
+    echo "Using decode config: $DECODE_SERVER_CONFIG"
+    echo "Decode node rank: $RANK"
+    echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}"
+
+    DECODE_MORI_MOE_ENV=""
+    set -x
+    if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_DECODE" ]]; then
+        DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}"
+    fi
+    set +x
+    DECODE_CMD="${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
+        --model-path ${MODEL_DIR}/${MODEL_NAME} \
+        --disaggregation-mode decode \
+        --disaggregation-ib-device ${IBDEVICES} \
+        --host 0.0.0.0 \
+        --port 8000 \
+        --trust-remote-code \
+        ${DECODE_SERVER_CONFIG} "
+
+    if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then
+        rank=$((RANK % DECODE_NODES_PER_WORKER))
+        decode_idx=$((RANK / DECODE_NODES_PER_WORKER))
+        DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes ${DECODE_NODES_PER_WORKER} --node-rank $rank"
+    fi
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $DECODE_CMD"
+    else
+        set -x
+        eval "$DECODE_CMD" \
+            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log &
+
+        set +x
+        decode_pid=$!
+    fi
+
+
+    echo "Waiting for proxy server to be up..."
+    BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports 30000 \
+        --wait-for-all-ports \
+        --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BARRIER_CMD"
+    else
+        eval "$BARRIER_CMD"
+    fi
+
+
+    echo "Waiting until proxy server closes..."
+    WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \
+        --remote-ip ${NODE0_ADDR} \
+        --remote-port 30000"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $WAIT_CMD"
+    else
+        eval "$WAIT_CMD"
+    fi
+
+    echo "Killing the rank $RANK decode server"
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        kill $decode_pid
+    fi
+
+fi
+
+echo "Script completed successfully"
+exit 0
\ No newline at end of file
diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh
new file mode 100755
index 000000000..4f07b1257
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/server_vllm.sh
@@ -0,0 +1,561 @@
+#!/bin/bash
+# vLLM Disaggregated Server Launcher with Model-Specific Configurations
+# =============================================================================
+#
+# Node role assignment (by NODE_RANK):
+#   0           -> Proxy/Router + first Prefill node  (kv_producer)
+#   1..xP-1     -> Additional Prefill nodes            (kv_producer)
+#   xP..xP+yD-1 -> Decode nodes                        (kv_consumer)
+#
+# Total nodes = xP + yD (router co-located with first prefill, like SGLang).
+
+# =============================================================================
+# Dependency Setup (idempotent; required when using base vLLM image)
+# =============================================================================
+source "$(dirname "${BASH_SOURCE[0]}")/setup_deps.sh"
+
+# =============================================================================
+# Environment Configuration
+# =============================================================================
+
+# Cluster addressing and role inputs. Each variable keeps a caller-supplied
+# value; when unset or empty it is assigned the default shown.
+: "${NODE0_ADDR:=localhost}"
+: "${NODE_RANK:=0}"
+: "${MODEL_DIR:=}"
+: "${MODEL_NAME:=}"
+
+# Prefill (xP) and decode (yD) node counts.
+: "${xP:=1}"
+: "${yD:=1}"
+
+: "${IPADDRS:=localhost}"
+
+# Benchmark Configuration
+: "${BENCH_INPUT_LEN:=1024}"
+: "${BENCH_OUTPUT_LEN:=1024}"
+: "${BENCH_RANDOM_RANGE_RATIO:=1}"
+: "${BENCH_REQUEST_RATE:=inf}"
+: "${BENCH_NUM_PROMPTS_MULTIPLIER:=10}"
+: "${BENCH_MAX_CONCURRENCY:=512}"
+
+: "${DRY_RUN:=0}"
+: "${GPUS_PER_NODE:=8}"
+
+: "${ROUTER_PORT:=30000}"
+: "${SERVER_PORT:=2584}"
+: "${ENGINE_ID:=${MODEL_NAME}-pd-run}"
+
+# Prefer MODEL_PATH from job.slurm (handles HF cache snapshot resolution)
+: "${MODEL_PATH:=${MODEL_DIR}/${MODEL_NAME}}"
+
+# =============================================================================
+# Dependencies and Environment Setup
+# =============================================================================
+# Quote the path so a WS_PATH containing whitespace is still sourced correctly.
+source "$WS_PATH/env.sh"
+
+# Management IP: the source address the kernel would use to reach 1.1.1.1.
+# Take the token that follows the literal "src" keyword instead of a fixed
+# field number; the position of "src" shifts when the route has no "via" hop.
+host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '{for (i = 1; i < NF; i++) if ($i == "src") {print $(i + 1); exit}}')
+# RDMA IP for Nixl KV transfer (prefer 192.168.x.x subnet if available)
+rdma_ip=$(hostname -I | tr ' ' '\n' | grep '^192\.168\.' | head -1)
+# Fall back to the management IP when no 192.168.x.x address is configured.
+rdma_ip="${rdma_ip:-$host_ip}"
+host_name=$(hostname)
+
+echo "[INFO] Management IP (barriers/proxy): $host_ip"
+echo "[INFO] RDMA IP (Nixl KV transfer): $rdma_ip"
+
+# =============================================================================
+# RDMA / Nixl Workarounds
+# =============================================================================
+
+# setup_rdma_env: apply RDMA workarounds before any server is launched.
+#   1. When $rdma_ip is a 192.168.x.y address, install a /24 route for that
+#      subnet via the interface that owns $rdma_ip.
+#   2. Patch the installed rixl._api module so the UCX backend is created with
+#      ucx_error_handling_mode=none (skipped if the module is not importable
+#      or the setting is already present).
+# Reads: rdma_ip, IBDEVICES (logging only). Takes no arguments.
+setup_rdma_env() {
+    # Pensando ionic (RoCEv2) point-to-point /31 route fix.
+    # Each benic interface has a /31 to the TOR switch. Without explicit routes,
+    # traffic to other nodes' RDMA IPs falls through to the management network.
+    if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then
+        local rdma_subnet="${BASH_REMATCH[1]}"
+        local rdma_host="${BASH_REMATCH[2]}"
+        # NOTE(review): "| 1" yields the host's own address when the last octet
+        # is odd; this presumably relies on hosts holding the even address of
+        # each /31 pair - confirm against the fabric addressing plan.
+        local rdma_gw="192.168.${rdma_subnet}.$(( rdma_host | 1 ))"
+        local rdma_iface
+        # Exact address comparison (with or without a /prefix suffix). A regex
+        # match on $4 would also accept e.g. 192.168.1.20/31 when looking for
+        # 192.168.1.2 and could select the wrong interface.
+        rdma_iface=$(ip -o addr show | awk -v ip="$rdma_ip" '$4 == ip || index($4, ip "/") == 1 {print $2}' | head -1)
+        if [[ -n "$rdma_iface" ]]; then
+            ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null && \
+                echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface" || \
+                echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24"
+        fi
+    fi
+
+    # Patch Nixl UCX backend: set ucx_error_handling_mode=none.
+    # Required for ALL NIC types under high concurrency (C512+). Without this,
+    # UCX's default UCP_ERR_HANDLING_MODE_PEER triggers transport-level error
+    # recovery on ibv_post_send failures, preventing RIXL RDMA READ retries from
+    # recovering gracefully. This causes the prefill KV cache to fill to 100%
+    # and deadlock the pipeline. On ionic NICs this was already applied (rdmacm
+    # incompatibility); on mlx5 NICs it was incorrectly skipped.
+    local nixl_api
+    nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null)
+    if [[ -n "$nixl_api" ]]; then
+        # The grep guard keeps the in-place edit idempotent across restarts.
+        if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then
+            sed -i '/self\.create_backend(bknd, init)/i\                init["ucx_error_handling_mode"] = "none"' "$nixl_api"
+            echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api (IBDEVICES=${IBDEVICES:-unset})"
+        else
+            echo "[PATCH] ucx_error_handling_mode already set in $nixl_api"
+        fi
+    fi
+}
+
+setup_rdma_env
+
+if [[ -z "$UCX_NET_DEVICES" ]]; then
+    echo "Error: UCX_NET_DEVICES is empty after env.sh detection" >&2
+    exit 1
+fi
+
+# =============================================================================
+# Model-Specific Configuration from YAML
+# =============================================================================
+MODELS_YAML="${WS_PATH}/models_vllm.yaml"
+
+if [[ ! -f "$MODELS_YAML" ]]; then
+    echo "ERROR: models.yaml not found at $MODELS_YAML"
+    exit 1
+fi
+
+if [[ -z "$MODEL_NAME" ]]; then
+    echo "ERROR: MODEL_NAME is not set"; exit 1
+fi
+
+# Render the model's YAML entry as bash assignments for PREFILL_SERVER_CONFIG,
+# DECODE_SERVER_CONFIG, MODEL_ENVS and DECODE_MODEL_ENVS. The generator output
+# is captured first so that a Python failure (unparseable YAML, a non-string
+# field, missing PyYAML) aborts here instead of eval'ing an empty string and
+# continuing with unset configuration. An unknown model still exits via the
+# generated "echo ...; exit 1" snippet.
+if ! MODEL_CONFIG_EXPORTS="$(python3 -c "
+import yaml, sys
+
+with open('${MODELS_YAML}') as f:
+    models = yaml.safe_load(f)
+
+model_name = '${MODEL_NAME}'
+if model_name not in models:
+    print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1')
+    sys.exit(0)
+
+m = models[model_name]
+
+def bash_escape(s):
+    \"\"\"Escape a value for safe embedding in a bash double-quoted assignment.\"\"\"
+    return s.replace('\\\\', '\\\\\\\\').replace('\"', '\\\\\"').replace('\$', '\\\\\$').replace('\`', '\\\\\`')
+
+pf = bash_escape(m.get('prefill_flags', '--tensor-parallel-size 8'))
+df = bash_escape(m.get('decode_flags', '--tensor-parallel-size 8'))
+ev = bash_escape(m.get('env', ''))
+dev = bash_escape(m.get('decode_env', ''))
+print(f'PREFILL_SERVER_CONFIG=\"{pf}\"')
+print(f'DECODE_SERVER_CONFIG=\"{df}\"')
+print(f'MODEL_ENVS=\"{ev}\"')
+print(f'DECODE_MODEL_ENVS=\"{dev}\"')
+")"; then
+    echo "ERROR: failed to load model configuration for $MODEL_NAME from $MODELS_YAML" >&2
+    exit 1
+fi
+eval "$MODEL_CONFIG_EXPORTS"
+
+echo "Loaded model configuration for: $MODEL_NAME"
+
+# Apply tensor-parallel size and EP/DP flags from submit pipeline.
+
+# _override_tp_size FLAGS TP
+# Print FLAGS with every --tensor-parallel-size value replaced by TP, or with
+# the flag appended when absent. Both "--tensor-parallel-size N" and
+# "--tensor-parallel-size=N" spellings are rewritten; previously the "=" form
+# was detected but left unchanged, silently dropping the override.
+_override_tp_size() {
+    local flags="$1" tp="$2"
+    if [[ "$flags" == *"--tensor-parallel-size"* ]]; then
+        # Here-string instead of echo: echo would misread a value that starts
+        # with -n / -e as its own option.
+        sed -E "s/--tensor-parallel-size([[:space:]]+|=)[0-9]+/--tensor-parallel-size ${tp}/g" <<< "$flags"
+    else
+        printf '%s\n' "${flags} --tensor-parallel-size ${tp}"
+    fi
+}
+
+# _append_flag_once FLAGS FLAG
+# Print FLAGS, adding FLAG at the end unless FLAGS already contains it.
+_append_flag_once() {
+    local flags="$1" flag="$2"
+    [[ "$flags" == *"$flag"* ]] || flags+=" $flag"
+    printf '%s\n' "$flags"
+}
+
+if [[ -n "${PREFILL_TP_SIZE:-}" ]]; then
+    PREFILL_SERVER_CONFIG=$(_override_tp_size "$PREFILL_SERVER_CONFIG" "$PREFILL_TP_SIZE")
+fi
+if [[ -n "${DECODE_TP_SIZE:-}" ]]; then
+    DECODE_SERVER_CONFIG=$(_override_tp_size "$DECODE_SERVER_CONFIG" "$DECODE_TP_SIZE")
+fi
+if [[ "${PREFILL_ENABLE_EP:-false}" == "true" ]]; then
+    PREFILL_SERVER_CONFIG=$(_append_flag_once "$PREFILL_SERVER_CONFIG" "--enable-expert-parallel")
+fi
+if [[ "${PREFILL_ENABLE_DP:-false}" == "true" ]]; then
+    PREFILL_SERVER_CONFIG=$(_append_flag_once "$PREFILL_SERVER_CONFIG" "--enable-dp-attention")
+fi
+if [[ "${DECODE_ENABLE_EP:-false}" == "true" ]]; then
+    DECODE_SERVER_CONFIG=$(_append_flag_once "$DECODE_SERVER_CONFIG" "--enable-expert-parallel")
+fi
+if [[ "${DECODE_ENABLE_DP:-false}" == "true" ]]; then
+    DECODE_SERVER_CONFIG=$(_append_flag_once "$DECODE_SERVER_CONFIG" "--enable-dp-attention")
+fi
+
+echo "PREFILL_SERVER_CONFIG (after TP/EP/DP): $PREFILL_SERVER_CONFIG"
+echo "DECODE_SERVER_CONFIG (after TP/EP/DP): $DECODE_SERVER_CONFIG"
+
+# =============================================================================
+# Container Synchronization
+# =============================================================================
+
+BARRIER_PORT="${BARRIER_PORT:-36380}"
+echo "Waiting at the container creation barrier on $host_name (port $BARRIER_PORT)"
+python3 $WS_PATH/sync.py barrier \
+    --local-ip ${host_ip} \
+    --local-port ${BARRIER_PORT} \
+    --enable-port \
+    --node-ips ${IPADDRS} \
+    --node-ports ${BARRIER_PORT} \
+    --wait-for-all-ports \
+    --timeout 600
+
+# =============================================================================
+# ETCD Server Setup
+# =============================================================================
+
+# echo "Proceeding to start etcd server on $host_name"
+# bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 &
+# etcd_pid=$!
+
+# echo "Waiting at etcd server barrier on $host_name"
+# python3 $WS_PATH/sync.py barrier \
+#     --node-ips ${IPADDRS} \
+#     --node-ports 2379 \
+#     --wait-for-all-ports \
+#     --timeout 300
+
+# echo "All etcd servers are up : $host_name"
+# sleep 3
+
+# echo "etcd endpoint health=================="
+# etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true
+# echo "======================================"
+
+# python3 $WS_PATH/sync.py barrier \
+#     --node-ips ${IPADDRS} \
+#     --node-ports 2379 \
+#     --wait-for-all-ports \
+#     --timeout 300
+
+# =============================================================================
+# Cluster Topology Configuration
+# =============================================================================
+IFS=',' read -ra IP_ARRAY <<< "$IPADDRS"
+
+PREFILL_ARGS=""
+DECODE_ARGS=""
+
+for ((i=0; i<xP && i<${#IP_ARRAY[@]}; i++)); do
+    PREFILL_ARGS+="${IP_ARRAY[$i]} "
+done
+
+for ((i=xP; i<${#IP_ARRAY[@]}; i++)); do
+    DECODE_ARGS+="${IP_ARRAY[$i]} "
+done
+
+echo "Prefill node IPs: ${PREFILL_ARGS}"
+echo "Decode  node IPs: ${DECODE_ARGS}"
+
+# MoRI-IO proxy ZMQ registration port (must match vllm-router --vllm-discovery-address)
+PROXY_PING_PORT="${PROXY_PING_PORT:-36367}"
+
+# vLLM environment (UCX transport vars are set at the Docker level in job.slurm)
+# Exports the fixed vLLM settings, then every whitespace-separated KEY=VALUE
+# entry of MODEL_ENVS (exported last, so a model entry can override a default).
+setup_vllm_env() {
+    export VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=0
+    export VLLM_NIXL_SIDE_CHANNEL_HOST="${rdma_ip}" VLLM_NIXL_SIDE_CHANNEL_PORT=5600
+    # Workaround: disable request-ID randomization so MoRI-IO connector can
+    # match completion IDs between prefill and decode without PR #34907 patch.
+    export VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1
+    local model_env
+    for model_env in ${MODEL_ENVS}; do
+        export "$model_env"
+    done
+}
+
+# =============================================================================
+# Node Role Assignment and Server Launch
+# =============================================================================
+
+# Rank 0: acts as the first prefill node and drives the benchmark/eval run.
+if [ "$NODE_RANK" -eq 0 ]; then
+    echo "NODE INFO ======================================="
+    echo "================================================"
+    echo "Node List : ${SLURM_JOB_NODELIST}"
+    echo "Node IPs  : ${IPADDRS}"
+    echo "Model     : ${MODEL_NAME:-'Not specified'}"
+    echo "================================================"
+
+    echo "CLUSTER INFO ===================================="
+    echo "================================================"
+    echo "${host_name}:${host_ip} is Proxy Node and Prefill Node"
+    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
+    echo "Prefill servers: ${PREFILL_ARGS}"
+    echo "Decode  servers: ${DECODE_ARGS}"
+    echo "================================================"
+
+    setup_vllm_env
+
+    # Router is started as an external container by job.slurm (VLLM_ROUTER_IMAGE)
+    echo "Using external vllm-router container (started by job.slurm on this node)"
+
+    # Prefill server command: KV producer that registers with the proxy at
+    # NODE0_ADDR:PROXY_PING_PORT; model-specific flags are appended last.
+    SERVED_MODEL="${MODEL_NAME}"
+    PREFILL_CMD="vllm serve ${MODEL_PATH} \
+        --served-model-name ${SERVED_MODEL} \
+        --port $SERVER_PORT \
+        --trust-remote-code \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        ${PREFILL_SERVER_CONFIG}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $PREFILL_CMD"
+    else
+        PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log"
+        set -x
+        # Launched in the background; stdout and stderr go to the per-host log.
+        eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 &
+        set +x
+        # "set +x" is a builtin, so $! still refers to the background job above.
+        prefill_pid=$!
+    fi
+
+    echo "Waiting for all prefill and decode servers to be up . . ."
+    # Block until SERVER_PORT is open on every node (skipped in dry-run mode).
+    if ! [[ "$DRY_RUN" -eq 1 ]]; then
+        python3 $WS_PATH/sync.py barrier --node-ips ${IPADDRS} --node-ports $SERVER_PORT --wait-for-all-ports --timeout 1800
+    else
+        echo "DRY RUN: skipping barrier (wait-for-all-ports)"
+    fi
+
+    echo "Congratulations!!! All prefill and decode servers are up . . ."
+
+    # Wait for proxy /health to confirm it is accepting requests
+    HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports ${ROUTER_PORT} \
+        --wait-for-all-health \
+        --health-endpoint /health \
+        --timeout 1800"
+
+    if ! [[ "$DRY_RUN" -eq 1 ]]; then
+        eval "$HEALTH_BARRIER_CMD"
+        echo "MoRI-IO proxy is ready for benchmarking"
+    else
+        echo "DRY RUN: $HEALTH_BARRIER_CMD"
+    fi
+
+    echo "Ready for benchmarking on ${host_name}:${host_ip}"
+    echo "Benchmarking on ${host_name}:${host_ip}"
+    cd $WS_PATH
+
+    # Make the router port visible to the benchmark child process.
+    export ROUTER_PORT
+    BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((GPUS_PER_NODE*xP)) $((GPUS_PER_NODE*yD)) \
+        $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
+        ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \
+        ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}"
+
+    # EVAL_ONLY takes precedence over dry-run when deciding what to report.
+    if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
+        if [[ "$DRY_RUN" -eq 1 ]]; then
+            echo "DRY RUN: $BENCH_CMD"
+        else
+            set -x
+            eval "$BENCH_CMD"
+            set +x
+        fi
+    else
+        echo "EVAL_ONLY mode: skipping throughput benchmark"
+    fi
+
+    # Run evaluation if requested (before killing router)
+    if [[ "${RUN_EVAL:-false}" == "true" ]]; then
+        echo "Running lm-eval evaluation on Node 0..."
+
+        # Probe the router's /health endpoint up to three times, 10s apart.
+        EVAL_HEALTH_OK=false
+        for _attempt in 1 2 3; do
+            if curl -sf --max-time 10 "http://0.0.0.0:${ROUTER_PORT}/health" >/dev/null 2>&1; then
+                EVAL_HEALTH_OK=true
+                break
+            fi
+            echo "Eval health check attempt $_attempt failed, retrying in 10s..."
+            sleep 10
+        done
+
+        if [[ "$EVAL_HEALTH_OK" != "true" ]]; then
+            echo "WARNING: Router health check failed after 3 attempts. Skipping eval."
+        else
+            pushd /workspace
+
+            # Presumably provides run_eval and append_lm_eval_summary used
+            # below - confirm against benchmark_lib.sh.
+            source /workspace/benchmarks/benchmark_lib.sh
+
+            # Eval concurrency: explicit EVAL_CONC wins; otherwise the largest
+            # value of the 'x'-separated BENCH_MAX_CONCURRENCY list.
+            if [[ -n "${EVAL_CONC:-}" ]]; then
+                export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}"
+            else
+                export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
+            fi
+
+            if [[ "$DRY_RUN" -eq 1 ]]; then
+                echo "DRY RUN: run_eval --framework lm-eval --port $ROUTER_PORT (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})"
+            else
+                run_eval --framework lm-eval --port "$ROUTER_PORT"
+                eval_rc=$?
+
+                if [[ $eval_rc -ne 0 ]]; then
+                    echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2
+                    # Checked after teardown to make node 0 exit non-zero.
+                    EVAL_FAILED=1
+                else
+                    # Export the run's topology for append_lm_eval_summary.
+                    # The *_EP values are 1 unless expert parallelism is
+                    # enabled, in which case they equal the matching TP size.
+                    # Variables assigned after their export stay exported.
+                    export TP="${PREFILL_TP_SIZE}"
+                    export CONC="${EVAL_CONCURRENT_REQUESTS}"
+                    export EP_SIZE=1
+                    [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}"
+                    export PREFILL_TP="${PREFILL_TP_SIZE}"
+                    export PREFILL_EP=1
+                    [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}"
+                    export PREFILL_NUM_WORKERS="${xP}"
+                    export DECODE_TP="${DECODE_TP_SIZE}"
+                    export DECODE_EP=1
+                    [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}"
+                    export DECODE_NUM_WORKERS="${yD}"
+                    export DP_ATTENTION="${PREFILL_ENABLE_DP}"
+                    export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}"
+                    export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}"
+                    export ISL="${BENCH_INPUT_LEN}"
+                    export OSL="${BENCH_OUTPUT_LEN}"
+
+                    append_lm_eval_summary
+
+                    # Stage eval artifacts from /workspace into the job's log
+                    # directory: meta_env.json, results*.json, sample*.jsonl.
+                    EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results"
+                    mkdir -p "$EVAL_COPY_DIR"
+                    for f in meta_env.json; do
+                        [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/"
+                    done
+                    find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \;
+                    find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \;
+
+                    echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR"
+                fi
+            fi
+
+            popd
+        fi
+    fi
+
+    # Copy benchmark/eval results to BENCHMARK_LOGS_DIR (mounted from host)
+    LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs"
+    mkdir -p "$LOGS_OUTPUT"
+
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        # Only report success when the copy actually succeeded; a failed copy
+        # is logged as a warning and does not abort the teardown below.
+        if cp -r "/run_logs/slurm_job-${SLURM_JOB_ID}" "$LOGS_OUTPUT/"; then
+            echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}"
+        else
+            echo "WARNING: failed to copy /run_logs/slurm_job-${SLURM_JOB_ID} to $LOGS_OUTPUT" >&2
+        fi
+    fi
+
+    echo "Killing the prefill server"
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        # Signal the launched job first, then sweep any remaining
+        # "vllm serve" processes; both steps are best-effort.
+        [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true
+        sleep 2
+        pkill -f "vllm serve" 2>/dev/null || true
+    fi
+
+    # Surface an eval failure only after logs are copied and servers stopped.
+    if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then
+        echo "ERROR: eval failed; exiting node-0 with rc=1"
+        exit 1
+    fi
+
+# Ranks 1..xP-1: extra prefill servers that live until the router port closes.
+elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then
+    echo "${host_name}:${host_ip} is Additional Prefill Node (Model: ${MODEL_NAME})"
+    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
+
+    setup_vllm_env
+
+    SERVED_MODEL="${MODEL_NAME}"
+    PREFILL_CMD="vllm serve ${MODEL_PATH} \
+        --served-model-name ${SERVED_MODEL} \
+        --port $SERVER_PORT \
+        --trust-remote-code \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        ${PREFILL_SERVER_CONFIG}"
+
+    # Start the server in the background unless this is a dry run.
+    if ! [[ "$DRY_RUN" -eq 1 ]]; then
+        PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log"
+        set -x
+        eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 &
+        set +x
+        prefill_pid=$!
+    else
+        echo "DRY RUN: $PREFILL_CMD"
+    fi
+
+    echo "Waiting for proxy server to be up..."
+    BARRIER_CMD="python3 $WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports ${ROUTER_PORT} \
+        --wait-for-all-ports \
+        --timeout 1800"
+
+    if ! [[ "$DRY_RUN" -eq 1 ]]; then
+        eval "$BARRIER_CMD"
+    else
+        echo "DRY RUN: $BARRIER_CMD"
+    fi
+
+    echo "Waiting until proxy server closes..."
+    WAIT_CMD="python3 $WS_PATH/sync.py wait \
+        --remote-ip ${NODE0_ADDR} \
+        --remote-port ${ROUTER_PORT}"
+
+    if ! [[ "$DRY_RUN" -eq 1 ]]; then
+        eval "$WAIT_CMD"
+    else
+        echo "DRY RUN: $WAIT_CMD"
+    fi
+
+    echo "Killing the prefill server"
+    # Best-effort: a failed kill is ignored.
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        kill $prefill_pid 2>/dev/null || true
+    fi
+
+else
+    echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})"
+    echo "Using decode config: $DECODE_SERVER_CONFIG"
+
+    setup_vllm_env
+
+    for env_pair in ${DECODE_MODEL_ENVS}; do
+        export "$env_pair"
+        echo "[DECODE_ENV] $env_pair"
+    done
+
+    SERVED_MODEL="${MODEL_NAME}"
+    DECODE_CMD="vllm serve ${MODEL_PATH} \
+        --served-model-name ${SERVED_MODEL} \
+        --port $SERVER_PORT \
+        --trust-remote-code \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        ${DECODE_SERVER_CONFIG}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $DECODE_CMD"
+    else
+        DECODE_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log"
+        set -x
+        eval "$DECODE_CMD" > "$DECODE_LOG_FILE" 2>&1 &
+        set +x
+        decode_pid=$!
+    fi
+
+    echo "Waiting for proxy server to be up..."
+    BARRIER_CMD="python3 $WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports ${ROUTER_PORT} \
+        --wait-for-all-ports \
+        --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BARRIER_CMD"
+    else
+        eval "$BARRIER_CMD"
+    fi
+
+    echo "Waiting until proxy server closes..."
+    WAIT_CMD="python3 $WS_PATH/sync.py wait \
+        --remote-ip ${NODE0_ADDR} \
+        --remote-port ${ROUTER_PORT}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $WAIT_CMD"
+    else
+        eval "$WAIT_CMD"
+    fi
+
+    echo "Killing the decode server"
+    [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid 2>/dev/null || true
+fi
+
+# echo "Killing the etcd server"
+# kill $etcd_pid 2>/dev/null || true
+# pkill -f etcd 2>/dev/null || true
+
+echo "Script completed successfully"
+exit 0
diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh
new file mode 100644
index 000000000..159b3e16b
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/setup_deps.sh
@@ -0,0 +1,253 @@
+#!/bin/bash
+# =============================================================================
+# setup_deps.sh — Install missing vLLM disagg dependencies at container start.
+#
+# Base image: vllm/vllm-openai-rocm:v0.18.0
+# Sourced by server.sh so PATH / LD_LIBRARY_PATH exports persist.
+# Idempotent: each component is skipped if already present.
+#
+# Build steps run in subshells to avoid CWD pollution between installers.
+# =============================================================================
+
+ROCM_PATH="${ROCM_PATH:-/opt/rocm}"
+UCX_HOME="${UCX_HOME:-/usr/local/ucx}"
+RIXL_HOME="${RIXL_HOME:-/usr/local/rixl}"
+
+_SETUP_START=$(date +%s)
+_SETUP_INSTALLED=()
+
+# git_clone_retry URL DEST
+# Clone URL into DEST, making up to 3 attempts with a 10s pause between them.
+# DEST is removed after each failed attempt so the next one starts clean.
+# Returns 0 on success, 1 once all attempts have failed. No "retrying" message
+# or pause follows the final attempt.
+git_clone_retry() {
+    local url="$1" dest="$2" max_tries=3 try
+    for (( try = 1; try <= max_tries; try++ )); do
+        if git clone --quiet "$url" "$dest" 2>/dev/null; then return 0; fi
+        rm -rf "$dest"
+        if (( try < max_tries )); then
+            echo "[SETUP] git clone attempt $try/$max_tries failed for $url, retrying in 10s..."
+            sleep 10
+        fi
+    done
+    echo "[SETUP] git clone failed after $max_tries attempts: $url"
+    return 1
+}
+
+# ---------------------------------------------------------------------------
+# 1. UCX (ROCm fork — required for GPU-direct RDMA via Nixl)
+# ---------------------------------------------------------------------------
+# install_ucx: build and install ROCm/ucx (commit da3fac2a) into $UCX_HOME.
+# Skipped when $UCX_HOME/bin/ucx_info is already executable. Exits the calling
+# shell with status 1 if ucx_info is still missing after the build.
+# Note: not invoked by the "Run installers" section of this file.
+install_ucx() {
+    if [[ -x "${UCX_HOME}/bin/ucx_info" ]]; then
+        echo "[SETUP] UCX already present at ${UCX_HOME}"
+        return 0
+    fi
+
+    echo "[SETUP] Installing UCX build dependencies..."
+    apt-get update -q -y && apt-get install -q -y \
+        autoconf automake libtool pkg-config \
+        librdmacm-dev rdmacm-utils libibverbs-dev ibverbs-utils ibverbs-providers \
+        infiniband-diags perftest ethtool rdma-core strace \
+        && rm -rf /var/lib/apt/lists/*
+
+    echo "[SETUP] Building UCX from source (ROCm/ucx @ da3fac2a)..."
+    # Subshell: keeps "set -e" and the directory changes local to the build.
+    (
+        set -e
+        mkdir -p /usr/local/src && cd /usr/local/src
+        git_clone_retry https://github.com/ROCm/ucx.git ucx && cd ucx
+        git checkout da3fac2a
+        ./autogen.sh && mkdir -p build && cd build
+        ../configure \
+            --prefix="${UCX_HOME}" \
+            --enable-shared --disable-static \
+            --disable-doxygen-doc --enable-optimizations \
+            --enable-devel-headers --enable-mt \
+            --with-rocm="${ROCM_PATH}" --with-verbs --with-dm
+        make -j"$(nproc)" && make install
+    )
+    # The source tree is removed whether or not the build succeeded.
+    rm -rf /usr/local/src/ucx
+
+    # Success is judged by the installed binary, not the subshell's status.
+    if [[ ! -x "${UCX_HOME}/bin/ucx_info" ]]; then
+        echo "[SETUP] ERROR: UCX build failed"; exit 1
+    fi
+    _SETUP_INSTALLED+=("UCX")
+}
+
+# ---------------------------------------------------------------------------
+# 2. RIXL (ROCm fork of NIXL — KV cache transfer for disaggregated vLLM)
+# ---------------------------------------------------------------------------
+# install_rixl: build ROCm/RIXL (commit f33a5599) against $UCX_HOME and
+# $ROCM_PATH, install it into $RIXL_HOME, and pip-install its Python package.
+# Skipped when "import rixl" already works. Exits the calling shell with
+# status 1 if the import still fails after the build.
+# Note: not invoked by the "Run installers" section of this file.
+install_rixl() {
+    if python3 -c "import rixl" 2>/dev/null; then
+        echo "[SETUP] RIXL Python bindings already present"
+        return 0
+    fi
+
+    echo "[SETUP] Installing RIXL build dependencies..."
+    apt-get update -q -y && apt-get install -q -y \
+        libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc \
+        libcpprest-dev libaio-dev \
+        && rm -rf /var/lib/apt/lists/*
+    pip3 install --quiet meson "pybind11[global]"
+
+    echo "[SETUP] Building RIXL from source (ROCm/RIXL @ f33a5599)..."
+    # Subshell: keeps "set -e" and the directory changes local to the build.
+    (
+        set -e
+        git_clone_retry https://github.com/ROCm/RIXL.git /opt/rixl && cd /opt/rixl
+        git checkout f33a5599
+        meson setup build --prefix="${RIXL_HOME}" \
+            -Ducx_path="${UCX_HOME}" \
+            -Drocm_path="${ROCM_PATH}"
+        cd build && ninja && ninja install
+        # Second step: install the Python package from the same checkout.
+        cd /opt/rixl
+        pip install --quiet \
+            --config-settings=setup-args="-Drocm_path=${ROCM_PATH}" \
+            --config-settings=setup-args="-Ducx_path=${UCX_HOME}" .
+    )
+    # The source tree is removed whether or not the build succeeded.
+    rm -rf /opt/rixl
+
+    # Success is judged by the import, not the subshell's status.
+    if ! python3 -c "import rixl" 2>/dev/null; then
+        echo "[SETUP] ERROR: RIXL build failed"; exit 1
+    fi
+    _SETUP_INSTALLED+=("RIXL")
+}
+
+# ---------------------------------------------------------------------------
+# 3. etcd (distributed KV store for vLLM disagg service discovery)
+# ---------------------------------------------------------------------------
+# install_etcd: download the etcd release tarball and unpack it into
+# /usr/local/bin/etcd. Skipped when /usr/local/bin/etcd/etcd is executable.
+# Returns 1, without recording etcd as installed, when the download or the
+# extraction fails.
+install_etcd() {
+    if [[ -x /usr/local/bin/etcd/etcd ]]; then
+        echo "[SETUP] etcd already present"
+        return 0
+    fi
+
+    local version="v3.6.0-rc.5"
+    local tarball="/tmp/etcd.tar.gz"
+    echo "[SETUP] Downloading etcd ${version}..."
+    if ! wget -q "https://github.com/etcd-io/etcd/releases/download/${version}/etcd-${version}-linux-amd64.tar.gz" \
+        -O "$tarball"; then
+        echo "[SETUP] ERROR: etcd download failed"
+        # wget -O leaves a (possibly empty) file behind on failure.
+        rm -f "$tarball"
+        return 1
+    fi
+    mkdir -p /usr/local/bin/etcd
+    if ! tar -xf "$tarball" -C /usr/local/bin/etcd --strip-components=1; then
+        echo "[SETUP] ERROR: etcd extraction failed"
+        rm -f "$tarball"
+        return 1
+    fi
+    rm -f "$tarball"
+    _SETUP_INSTALLED+=("etcd")
+}
+
+# ---------------------------------------------------------------------------
+# 4. libionic1 (Pensando ionic RDMA verbs provider for RoCEv2 KV transfer)
+#    Harmless on non-Pensando nodes (shared lib is simply unused).
+# ---------------------------------------------------------------------------
+# install_libionic: download and dpkg-install the libionic1 package.
+# Skipped when dpkg already lists libionic1 as installed. Best-effort: a
+# failed download or install only logs a warning and still returns 0, but
+# libionic1 is recorded as installed only when dpkg actually succeeded.
+install_libionic() {
+    if dpkg -l libionic1 2>/dev/null | grep -q '^ii'; then
+        echo "[SETUP] libionic1 already installed"
+        return 0
+    fi
+
+    echo "[SETUP] Downloading and installing libionic1..."
+    if wget -q "https://repo.radeon.com/amdainic/pensando/ubuntu/1.117.5/pool/main/r/rdma-core/libionic1_54.0-149.g3304be71_amd64.deb" \
+        -O /tmp/libionic1.deb && dpkg -i /tmp/libionic1.deb; then
+        _SETUP_INSTALLED+=("libionic1")
+    else
+        echo "[SETUP] WARN: libionic1 download/install failed (non-fatal)"
+    fi
+    rm -f /tmp/libionic1.deb
+    return 0
+}
+
+# ---------------------------------------------------------------------------
+# 5. Container RDMA/net tools
+#    - ibv_devinfo comes from ibverbs-utils
+#    - iproute2 provides the `ip` command
+#    Used for in-container NIC/RDMA validation and routing checks.
+# ---------------------------------------------------------------------------
+install_recipe_deps() {
+    if command -v ibv_devinfo >/dev/null 2>&1 && command -v ip >/dev/null 2>&1; then
+        echo "[SETUP] Container RDMA/net tools already present"
+        return 0
+    fi
+
+    echo "[SETUP] Installing ibv_devinfo + iproute2 in container..."
+    apt-get update -q -y && apt-get install -q -y \
+        ibverbs-utils iproute2 \
+        && rm -rf /var/lib/apt/lists/*
+
+    if ! command -v ibv_devinfo >/dev/null 2>&1 || ! command -v ip >/dev/null 2>&1; then
+        echo "[SETUP] ERROR: Failed to install ibv_devinfo/iproute2"; exit 1
+    fi
+    _SETUP_INSTALLED+=("ibverbs-utils+iproute2")
+}
+
+# ---------------------------------------------------------------------------
+# 6. MoRI (Modular RDMA Interface — EP dispatch/combine kernels for MoE)
+#    Required for --all2all-backend mori (Expert Parallelism via RDMA).
+#    GPU kernels are JIT-compiled on first use; no hipcc needed at install.
+#
+#    v0.18.0 ships MoRI 0.1.dev185+g2d02c6a98, but it STILL has the PCI
+#    topology bug (TopoSystemPci::Load assertion failure on Broadcom
+#    PEX890xx switches).  Always rebuild from our target commit b645fc8
+#    which includes the dsp2dev subordinate-range fix.
+# ---------------------------------------------------------------------------
+install_mori() {
+    local MORI_TARGET_COMMIT="b645fc8"
+    local MORI_MARKER="/usr/local/lib/python3.*/dist-packages/.mori_commit_${MORI_TARGET_COMMIT}"
+
+    if ls $MORI_MARKER &>/dev/null; then
+        echo "[SETUP] MoRI @ $MORI_TARGET_COMMIT already installed (marker found)"
+        return 0
+    fi
+
+    echo "[SETUP] Installing MoRI build dependencies..."
+    apt-get update -q -y && apt-get install -q -y \
+        libopenmpi-dev openmpi-bin libpci-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+    echo "[SETUP] Building MoRI from source (ROCm/mori @ $MORI_TARGET_COMMIT)..."
+    echo "[SETUP]   (overriding image-provided version to fix PCI topology bug)"
+    (
+        set -e
+        git_clone_retry https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori
+        git checkout "$MORI_TARGET_COMMIT"
+        pip install --quiet --force-reinstall .
+    )
+    rm -rf /opt/mori
+
+    if ! python3 -c "import mori" 2>/dev/null; then
+        echo "[SETUP] ERROR: MoRI build failed"; exit 1
+    fi
+    touch $(python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")/.mori_commit_${MORI_TARGET_COMMIT}
+    _SETUP_INSTALLED+=("MoRI@$MORI_TARGET_COMMIT")
+}
+
+# ---------------------------------------------------------------------------
+# 6b. amd-quark (MXFP4 quantization support for Kimi-K2.5-MXFP4 and similar)
+#     Required due to ROCm vLLM missing the quark dependency:
+#     https://github.com/vllm-project/vllm/issues/35633
+# ---------------------------------------------------------------------------
+# install_amd_quark: pip-install amd-quark when "import quark" fails.
+# Always returns 0; a failed install only produces a warning.
+install_amd_quark() {
+    if ! python3 -c "import quark" 2>/dev/null; then
+        echo "[SETUP] Installing amd-quark for MXFP4 quantization support..."
+        pip install --quiet amd-quark
+
+        # Verify by import rather than by pip's exit status.
+        if python3 -c "import quark" 2>/dev/null; then
+            _SETUP_INSTALLED+=("amd-quark")
+        else
+            echo "[SETUP] WARN: amd-quark install failed (non-fatal for non-MXFP4 models)"
+        fi
+        return 0
+    fi
+
+    echo "[SETUP] amd-quark already present"
+    return 0
+}
+
+# =============================================================================
+# Run installers
+# =============================================================================
+
+install_recipe_deps
+install_amd_quark
+
+# =============================================================================
+# Export paths (persists for server.sh since this file is sourced)
+# =============================================================================
+
+# The three prefixes already hold their final values; mark them for export.
+export ROCM_PATH UCX_HOME RIXL_HOME
+# Prepend tool and library directories so they take precedence.
+PATH="${UCX_HOME}/bin:/usr/local/bin/etcd:/root/.cargo/bin:${PATH}"
+LD_LIBRARY_PATH="${UCX_HOME}/lib:${RIXL_HOME}/lib:${RIXL_HOME}/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}"
+export PATH LD_LIBRARY_PATH
+
+# Summary line. Both branches report elapsed seconds since _SETUP_START; the
+# "already present" branch previously printed the raw epoch timestamp.
+_SETUP_END=$(date +%s)
+_SETUP_ELAPSED=$(( _SETUP_END - _SETUP_START ))
+if [[ ${#_SETUP_INSTALLED[@]} -eq 0 ]]; then
+    echo "[SETUP] All dependencies already present (${_SETUP_ELAPSED}s wallclock)"
+else
+    echo "[SETUP] Installed: ${_SETUP_INSTALLED[*]} in ${_SETUP_ELAPSED}s"
+fi
diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh
index d2c49bc9e..fa3d65418 100755
--- a/benchmarks/multi_node/amd_utils/submit.sh
+++ b/benchmarks/multi_node/amd_utils/submit.sh
@@ -2,37 +2,51 @@
 #
 # Cluster Configuration Template for Multi-Node Disaggregated Serving
 #
-# This script submits a multi-node SGLang disaggregated benchmark job to SLURM.
+# This script submits a multi-node disaggregated benchmark job to SLURM.
 # It must be configured for your specific cluster before use.
+#
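+# FRAMEWORK=sglang-disagg: SGLang disaggregated serving (FRAMEWORK is exported to the job as ENGINE)
+# FRAMEWORK=vllm-disagg:   vLLM disaggregated serving (also applies the vLLM-specific defaults below)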
+#
+# Router is co-located with the first prefill node (same for both engines),
+# so NUM_NODES = PREFILL_NODES + DECODE_NODES.
 
 usage() {
     cat << 'USAGE'
-This script aims to provide a one-liner call to the submit_job_script.py,
-so that the deployment process can be further simplified.
-
-To use this script, fill in the following script and run it under your `slurm_jobs` directory:
-======== begin script area ========
-# REQUIRED: Cluster-specific configuration
-export SLURM_ACCOUNT=              # Your SLURM account name
-export SLURM_PARTITION=            # SLURM partition to submit to
-export TIME_LIMIT=                 # Job time limit (e.g., "08:00:00")
-
-# REQUIRED: Model and container paths
-export MODEL_PATH=                 # Path to model directory (e.g., /mnt/models, /nfsdata)
-export CONTAINER_IMAGE=            # Path to container squash file
-
-# REQUIRED: Hardware configuration
-export GPUS_PER_NODE=              # GPUs per node (e.g., 8 for MI355X, 4 for MI325X)
-
-# OPTIONAL: RDMA/Network configuration (set in runners/launch_mi355x-amds.sh for AMD)
-# export IBDEVICES=                # RDMA device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...)
-# export MORI_RDMA_TC=             # RDMA traffic class (e.g., 96, 104)
-
-bash submit.sh \
-$PREFILL_NODES $PREFILL_WORKERS $DECODE_NODES $DECODE_WORKERS \
-$ADDITIONAL_FRONTENDS \
-$ISL $OSL $CONCURRENCIES $REQUEST_RATE
-======== end script area ========
+Usage:
+  bash submit.sh <PREFILL_NODES> <PREFILL_WORKERS> <DECODE_NODES> <DECODE_WORKERS> \
+                 <ISL> <OSL> <CONCURRENCIES> <REQUEST_RATE> \
+                 <PREFILL_ENABLE_EP> <PREFILL_ENABLE_DP> \
+                 <DECODE_ENABLE_EP> <DECODE_ENABLE_DP> \
+                 <PREFILL_TP> <DECODE_TP> \
+                 <RANDOM_RANGE_RATIO> [NODE_LIST]
+
+Arguments:
+  PREFILL_NODES        Number of prefill nodes
+  PREFILL_WORKERS      Number of prefill workers (usually 1)
+  DECODE_NODES         Number of decode nodes
+  DECODE_WORKERS       Number of decode workers (usually 1)
+  ISL                  Input sequence length
+  OSL                  Output sequence length
+  CONCURRENCIES        Concurrency levels, delimited by 'x' (e.g., "8x16x32")
+  REQUEST_RATE         Request rate ("inf" for max throughput)
+  PREFILL_ENABLE_EP    true/false or 1/0 (expert parallelism on prefill)
+  PREFILL_ENABLE_DP    true/false or 1/0 (data-parallel attention on prefill)
+  DECODE_ENABLE_EP     true/false or 1/0 (expert parallelism on decode)
+  DECODE_ENABLE_DP     true/false or 1/0 (data-parallel attention on decode)
+  PREFILL_TP           Tensor parallel size per prefill node
+  DECODE_TP            Tensor parallel size per decode node
+  RANDOM_RANGE_RATIO   Random range ratio for benchmark client
+  NODE_LIST            Optional: comma-separated hostnames (must match NUM_NODES)
+
+Required environment variables:
+  SLURM_ACCOUNT    SLURM account name
+  SLURM_PARTITION  SLURM partition
+  TIME_LIMIT       Job time limit (e.g., "08:00:00")
+  MODEL_PATH       Path to model directory (e.g., /nfsdata)
+  MODEL_NAME       Model name directory
+  CONTAINER_IMAGE  Docker image name (e.g., vllm_disagg_pd:latest)
+  RUNNER_NAME      Runner identifier (for job name)
 USAGE
 }
 
@@ -53,6 +67,7 @@ check_env MODEL_PATH
 check_env MODEL_NAME
 check_env CONTAINER_IMAGE
 check_env RUNNER_NAME
+check_env FRAMEWORK
 
 # GPUS_PER_NODE defaults to 8 (MI355X). Set to 4 for MI325X if needed.
 GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
@@ -66,31 +81,32 @@ ISL=$5
 OSL=$6
 CONCURRENCIES=$7
 REQUEST_RATE=$8
-PREFILL_ENABLE_EP=${9:-1}
-PREFILL_ENABLE_DP=${10:-1}
-DECODE_ENABLE_EP=${11:-1}
-DECODE_ENABLE_DP=${12:-1}
+PREFILL_ENABLE_EP=${9:-true}
+PREFILL_ENABLE_DP=${10:-true}
+DECODE_ENABLE_EP=${11:-true}
+DECODE_ENABLE_DP=${12:-true}
 PREFILL_TP=${13:-8}
 DECODE_TP=${14:-8}
-RANDOM_RANGE_RATIO=${15}
+RANDOM_RANGE_RATIO=${15:-0.8}
 NODE_LIST=${16}
 
-
 NUM_NODES=$((PREFILL_NODES + DECODE_NODES))
 profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}"
 
 # Export variables for the SLURM job
+export ENGINE="${FRAMEWORK:-sglang}"
 export MODEL_DIR=$MODEL_PATH
 export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE
 export PROFILER_ARGS=$profiler_args
 
-
-
+# vLLM-disagg only: default the proxy stream idle timeout and the MoRIIO connector read mode
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300}
+    export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1}
+fi
+# xP = prefill workers, yD = decode workers (may span multiple nodes)
 export xP=$PREFILL_WORKERS
 export yD=$DECODE_WORKERS
-export NUM_NODES=$NUM_NODES
-export GPUS_PER_NODE=$GPUS_PER_NODE
-export MODEL_NAME=$MODEL_NAME
 export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $PREFILL_TP / $PREFILL_WORKERS ))
 export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP}
 export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP}
@@ -98,12 +114,16 @@ export DECODE_TP_SIZE=$(( $DECODE_NODES * $DECODE_TP / $DECODE_WORKERS ))
 export DECODE_ENABLE_EP=${DECODE_ENABLE_EP}
 export DECODE_ENABLE_DP=${DECODE_ENABLE_DP}
 export DECODE_MTP_SIZE=${DECODE_MTP_SIZE}
+
+export NUM_NODES=$NUM_NODES
+export GPUS_PER_NODE=$GPUS_PER_NODE
+export MODEL_NAME=$MODEL_NAME
 export BENCH_INPUT_LEN=${ISL}
 export BENCH_OUTPUT_LEN=${OSL}
-export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO}
-export BENCH_NUM_PROMPTS_MULTIPLIER=10
+export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10}
 export BENCH_MAX_CONCURRENCY=${CONCURRENCIES}
 export BENCH_REQUEST_RATE=${REQUEST_RATE}
+export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8}
 
 # Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker)
 export RUN_EVAL="${RUN_EVAL:-false}"
@@ -118,13 +138,10 @@ export SPEC_DECODING="${SPEC_DECODING:-}"
 export IS_MULTINODE="${IS_MULTINODE:-false}"
 
 # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output.
-# SLURM writes output files on the batch node, so /tmp won't work (node-local).
-# Defaults to a sibling directory of the submit working directory.
 export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
 mkdir -p "$BENCHMARK_LOGS_DIR"
 
 # Optional: pass an explicit node list to sbatch.
-# NODE_LIST is expected to be comma-separated hostnames.
 NODELIST_OPT=()
 if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then
     IFS=',' read -r -a NODE_ARR <<< "$NODE_LIST"
@@ -137,6 +154,63 @@ if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then
     NODELIST_OPT=(--nodelist "$NODELIST_CSV")
 fi
 
+# Optional: exclude specific nodes (e.g. nodes with broken Docker sockets) via a comma-separated
+# SLURM_EXCLUDE_NODES. Unset -> built-in default list below; set to "" -> no --exclude is passed.
+EXCLUDE_OPT=()
+SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES-mia1-p01-g11,mia1-p01-g12,mia1-p01-g15}"
+if [[ -n "$SLURM_EXCLUDE_NODES" ]]; then
+    EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES")
+fi
+
+# =============================================================================
+# Reuse existing allocation (skip sbatch)
+# =============================================================================
+# When SLURM_REUSE_JOBID is set, run job.slurm directly in the current shell,
+# attaching to the existing allocation. Inner `srun` calls pick up the
+# allocation via SLURM_JOB_ID; SLURM_OVERLAP=1 lets them share task slots with
+# the interactive shell already holding the allocation.
+if [[ -n "${SLURM_REUSE_JOBID:-}" ]]; then
+    REUSE_JID="$SLURM_REUSE_JOBID"
+    echo "Reusing existing Slurm allocation ${REUSE_JID} (skipping sbatch)" >&2
+
+    # Resolve allocation's nodelist if not already provided.
+    ALLOC_NODELIST="${SLURM_JOB_NODELIST:-$(squeue -h -j "$REUSE_JID" -o '%N' 2>/dev/null)}"
+    if [[ -z "$ALLOC_NODELIST" ]]; then
+        echo "Error: could not resolve nodelist for job ${REUSE_JID}" >&2
+        exit 1
+    fi
+    ALLOC_NNODES=$(scontrol show hostnames "$ALLOC_NODELIST" | wc -l)
+    if [[ "$ALLOC_NNODES" -lt "$NUM_NODES" ]]; then
+        echo "Error: allocation ${REUSE_JID} has ${ALLOC_NNODES} nodes, need ${NUM_NODES}" >&2
+        exit 1
+    fi
+
+    export SLURM_JOB_ID="$REUSE_JID"
+    export SLURM_JOBID="$REUSE_JID"
+    export SLURM_JOB_NODELIST="$ALLOC_NODELIST"
+    export SLURM_NODELIST="$ALLOC_NODELIST"
+    export SLURM_NNODES="$ALLOC_NNODES"
+    export SLURM_JOB_NUM_NODES="$ALLOC_NNODES"
+    export SLURM_NTASKS="$ALLOC_NNODES"
+    export SLURM_NPROCS="$ALLOC_NNODES"
+    export SLURM_NTASKS_PER_NODE=1
+    export SLURM_TASKS_PER_NODE="1(x${ALLOC_NNODES})"
+    export SLURM_OVERLAP=1
+    export SLURM_SUBMIT_DIR="$(pwd)"
+
+    STDOUT_LOG="${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.out"
+    STDERR_LOG="${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.err"
+    rm -f "$STDOUT_LOG" "$STDERR_LOG"
+
+    nohup bash "$(dirname "$0")/job.slurm" >"$STDOUT_LOG" 2>"$STDERR_LOG" &
+    INLINE_PID=$!
+    echo "$INLINE_PID" > "${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.pid"
+    echo "Started job.slurm (pid=${INLINE_PID}); logs: ${STDOUT_LOG}" >&2
+
+    echo "$REUSE_JID"
+    exit 0
+fi
+
 # Construct the sbatch command
 sbatch_cmd=(
     sbatch
@@ -145,6 +219,7 @@ sbatch_cmd=(
     -N "$NUM_NODES"
     -n "$NUM_NODES"
     "${NODELIST_OPT[@]}"
+    "${EXCLUDE_OPT[@]}"
     --time "$TIME_LIMIT"
     --partition "$SLURM_PARTITION"
     --account "$SLURM_ACCOUNT"
@@ -154,7 +229,6 @@ sbatch_cmd=(
     "$(dirname "$0")/job.slurm"
 )
 
-# todo: --parsable outputs only the jobid and cluster name, test if jobid;clustername is correct
 JOB_ID=$("${sbatch_cmd[@]}")
 if [[ $? -ne 0 ]]; then
     echo "Error: Failed to submit job with sbatch" >&2
diff --git a/benchmarks/multi_node/amd_utils/sync.py b/benchmarks/multi_node/amd_utils/sync.py
index 140951519..3678e7614 100755
--- a/benchmarks/multi_node/amd_utils/sync.py
+++ b/benchmarks/multi_node/amd_utils/sync.py
@@ -143,7 +143,10 @@ def close_port():
             time.sleep(30)
 
     if args.enable_port:
-        time.sleep(30)
+        # Keep the port open long enough for slow nodes to pass their barrier.
+        # The previous 30s was too short when setup times vary by minutes.
+        grace = max(60, args.timeout // 2) if args.timeout > 0 else 300
+        time.sleep(grace)
         close_port()
 
 
diff --git a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
index 6a7314ab4..d17d1a323 100644
--- a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
+++ b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
@@ -19,7 +19,8 @@ check_env_vars \
     DECODE_DP_ATTN \
     PREFILL_NODES \
     DECODE_NODES \
-    RANDOM_RANGE_RATIO
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK
 
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
index 0124d4b4d..a8c0d2743 100644
--- a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
+++ b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
@@ -19,7 +19,8 @@ check_env_vars \
     DECODE_DP_ATTN \
     PREFILL_NODES \
     DECODE_NODES \
-    RANDOM_RANGE_RATIO
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK
 
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
diff --git a/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh
new file mode 100755
index 000000000..d7995fb25
--- /dev/null
+++ b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+# Same EP/DP booleans as dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+    export PREFILL_ENABLE_EP=false
+else
+    export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
+    export PREFILL_ENABLE_DP=true
+else
+    export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+    export DECODE_ENABLE_EP=false
+else
+    export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then
+    export DECODE_ENABLE_DP=true
+else
+    export DECODE_ENABLE_DP=false
+fi
+
+# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST.
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
+    $PREFILL_NUM_WORKERS \
+    $DECODE_NODES \
+    $DECODE_NUM_WORKERS \
+    $ISL $OSL "${CONC_LIST// /x}" inf \
+    ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
+    ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
+    ${RANDOM_RANGE_RATIO} \
+    "${NODELIST:-}")
+
+if [[ $? -ne 0 ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/benchmarks/multi_node/minimaxm2.5_fp8_mi300x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm2.5_fp8_mi300x_vllm-disagg.sh
new file mode 100644
index 000000000..a9a28d889
--- /dev/null
+++ b/benchmarks/multi_node/minimaxm2.5_fp8_mi300x_vllm-disagg.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+    export PREFILL_ENABLE_EP=false
+else
+    export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
+    export PREFILL_ENABLE_DP=true
+else
+    export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+    export DECODE_ENABLE_EP=false
+else
+    export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then
+    export DECODE_ENABLE_DP=true
+else
+    export DECODE_ENABLE_DP=false
+fi
+
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
+    $PREFILL_NUM_WORKERS \
+    $DECODE_NODES \
+    $DECODE_NUM_WORKERS \
+    $ISL $OSL "${CONC_LIST// /x}" inf \
+    ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
+    ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
+    ${RANDOM_RANGE_RATIO} \
+    "${NODELIST:-}")
+
+if [[ $? -ne 0 ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/benchmarks/multi_node/minimaxm2.5_fp8_mi325x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm2.5_fp8_mi325x_vllm-disagg.sh
new file mode 100644
index 000000000..a9a28d889
--- /dev/null
+++ b/benchmarks/multi_node/minimaxm2.5_fp8_mi325x_vllm-disagg.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+    export PREFILL_ENABLE_EP=false
+else
+    export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
+    export PREFILL_ENABLE_DP=true
+else
+    export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+    export DECODE_ENABLE_EP=false
+else
+    export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then
+    export DECODE_ENABLE_DP=true
+else
+    export DECODE_ENABLE_DP=false
+fi
+
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
+    $PREFILL_NUM_WORKERS \
+    $DECODE_NODES \
+    $DECODE_NUM_WORKERS \
+    $ISL $OSL "${CONC_LIST// /x}" inf \
+    ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
+    ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
+    ${RANDOM_RANGE_RATIO} \
+    "${NODELIST:-}")
+
+if [[ $? -ne 0 ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh
new file mode 100644
index 000000000..a9a28d889
--- /dev/null
+++ b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+    export PREFILL_ENABLE_EP=false
+else
+    export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
+    export PREFILL_ENABLE_DP=true
+else
+    export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+    export DECODE_ENABLE_EP=false
+else
+    export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then
+    export DECODE_ENABLE_DP=true
+else
+    export DECODE_ENABLE_DP=false
+fi
+
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
+    $PREFILL_NUM_WORKERS \
+    $DECODE_NODES \
+    $DECODE_NUM_WORKERS \
+    $ISL $OSL "${CONC_LIST// /x}" inf \
+    ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
+    ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
+    ${RANDOM_RANGE_RATIO} \
+    "${NODELIST:-}")
+
+if [[ $? -ne 0 ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 9c4910b13..28c33c497 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2934,3 +2934,23 @@
   description:
     - "Update SGLang ROCm image from v0.5.11/v0.5.10rc0 to v0.5.12-rocm720-mi35x-20260517"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1440
+
+- config-keys:
+    - kimik2.5-fp4-mi355x-vllm-disagg
+  description:
+    - "Add vLLM disaggregated prefill-decode benchmark for Kimi-K2.5-MXFP4 on MI355X"
+
+- config-keys:
+    - minimaxm2.5-fp8-mi355x-vllm-disagg
+  description:
+    - "Add vLLM disaggregated prefill-decode benchmark for MiniMax-M2.5 on MI355X"
+
+- config-keys:
+    - minimaxm2.5-fp8-mi300x-vllm-disagg
+  description:
+    - "Add vLLM disaggregated prefill-decode benchmark for MiniMax-M2.5 on MI300X"
+
+- config-keys:
+    - minimaxm2.5-fp8-mi325x-vllm-disagg
+  description:
+    - "Add vLLM disaggregated prefill-decode benchmark for MiniMax-M2.5 on MI325X"
diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh
index 4f085d0ad..ee5aa4b94 100644
--- a/runners/launch_mi300x-amds.sh
+++ b/runners/launch_mi300x-amds.sh
@@ -1,43 +1,497 @@
 #!/usr/bin/env bash
 
-export HF_HUB_CACHE_MOUNT="/raid/hf-hub-cache/"
-export PORT=8888
+scancel_sync() {
+    local jobid=$1
+    local timeout=${2:-600}
+    local interval=10
+    local start
+    start=$(date +%s)
 
-PARTITION="compute"
-SQUASH_FILE="/home/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
-LOCK_FILE="${SQUASH_FILE}.lock"
+    echo "[scancel_sync] Requesting cancel of job $jobid"
+    scancel "$jobid" || true
 
-set -x
+    while [[ -n "$(squeue -j "$jobid" --noheader 2>/dev/null)" ]]; do
+        local now
+        now=$(date +%s)
+        if (( now - start >= timeout )); then
+            echo "[scancel_sync][WARN] job $jobid still present after ${timeout}s"
+            return 1
+        fi
+        echo "[scancel_sync] waiting for job $jobid to exit. $((timeout-(now-start))) secs remaining..."
+        sleep "$interval"
+    done
+    echo "[scancel_sync] job $jobid exited"
+    return 0
+}
 
 # Pin to the known-good mi300x nodes; others are unavailable:
-#   chi-mi300x-033, chi-mi300x-037: down (Not responding)
-#   chi-mi300x-049:                  drained (persistent /nvme_home disk-full)
-JOB_ID=$(salloc --partition=$PARTITION --nodelist=chi-mi300x-[034-036,054,057-058].ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+#   chi-mi300x-033,037: down*
+#   chi-mi300x-049:     down
+export SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES:-chi-mi300x-033,chi-mi300x-037,chi-mi300x-049}"
 
-if [ -z "$JOB_ID" ]; then
-    echo "ERROR: salloc failed to allocate a job"
-    exit 1
-fi
+if [[ "$IS_MULTINODE" == "true" ]]; then
+    set -x
+
+    export SLURM_ACCOUNT="$USER"
+    export SLURM_PARTITION="compute"
+    export SLURM_JOB_NAME="benchmark-${FRAMEWORK}.job"
+
+    export MODEL_NAME=${MODEL##*/}
+    export MODEL_PATH="/raid/hf-hub-cache"
+    export IBDEVICES="bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7"
+    export MORI_RDMA_TC=104
+    export VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260629-e667ebb}"
+
+    export MODEL_DIR="$MODEL_PATH"
+    export GPUS_PER_NODE=8
+
+    export ISL="$ISL"
+    export OSL="$OSL"
+
+    export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}"
+    mkdir -p "$BENCHMARK_LOGS_DIR"
+    sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
+
+    save_multinode_diagnostics() {
+        local art_dir="$GITHUB_WORKSPACE/benchmark_artifacts"
+        mkdir -p "$art_dir"
+
+        cp -r "$BENCHMARK_LOGS_DIR"/submit_*.log "$art_dir/" 2>/dev/null || true
+        if [[ "${JOB_ID:-}" =~ ^[0-9]+$ ]]; then
+            cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$art_dir/" 2>/dev/null || true
+            scontrol show job "$JOB_ID" > "$art_dir/scontrol_job_${JOB_ID}.txt" 2>&1 || true
+            sacct -j "$JOB_ID" --format=JobID,JobName,State,ExitCode,Elapsed,NodeList%80 > "$art_dir/sacct_job_${JOB_ID}.txt" 2>&1 || true
+        fi
+
+        squeue -u "$USER" > "$art_dir/squeue_${USER}.txt" 2>&1 || true
+        {
+            echo "RUNNER_NAME=${RUNNER_NAME:-}"
+            echo "RUNNER_TYPE=${RUNNER_TYPE:-}"
+            echo "SLURM_ACCOUNT=${SLURM_ACCOUNT:-}"
+            echo "SLURM_PARTITION=${SLURM_PARTITION:-}"
+            echo "SLURM_EXCLUDE_NODES=${SLURM_EXCLUDE_NODES:-}"
+            echo "NODELIST=${NODELIST:-}"
+            echo "SCRIPT_NAME=${SCRIPT_NAME:-}"
+            echo "BENCHMARK_SUBDIR=${BENCHMARK_SUBDIR:-}"
+            echo "BENCHMARK_LOGS_DIR=${BENCHMARK_LOGS_DIR:-}"
+            echo "MODEL=${MODEL:-}"
+            echo "MODEL_NAME=${MODEL_NAME:-}"
+            echo "MODEL_PATH=${MODEL_PATH:-}"
+            echo "FRAMEWORK=${FRAMEWORK:-}"
+            echo "PRECISION=${PRECISION:-}"
+            echo "ISL=${ISL:-}"
+            echo "OSL=${OSL:-}"
+            echo "CONC_LIST=${CONC_LIST:-}"
+            echo "PREFILL_NODES=${PREFILL_NODES:-}"
+            echo "PREFILL_NUM_WORKERS=${PREFILL_NUM_WORKERS:-}"
+            echo "PREFILL_TP=${PREFILL_TP:-}"
+            echo "PREFILL_EP=${PREFILL_EP:-}"
+            echo "PREFILL_DP_ATTN=${PREFILL_DP_ATTN:-}"
+            echo "DECODE_NODES=${DECODE_NODES:-}"
+            echo "DECODE_NUM_WORKERS=${DECODE_NUM_WORKERS:-}"
+            echo "DECODE_TP=${DECODE_TP:-}"
+            echo "DECODE_EP=${DECODE_EP:-}"
+            echo "DECODE_DP_ATTN=${DECODE_DP_ATTN:-}"
+            echo "RUN_EVAL=${RUN_EVAL:-}"
+            echo "EVAL_ONLY=${EVAL_ONLY:-}"
+            echo "EVAL_CONC=${EVAL_CONC:-}"
+            echo "RESULT_FILENAME=${RESULT_FILENAME:-}"
+        } > "$art_dir/launcher_env.txt" 2>&1 || true
+
+        if compgen -G "$art_dir/*" > /dev/null; then
+            tar -czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$art_dir" . 2>/dev/null || true
+        fi
+    }
 
-# Use flock to serialize concurrent imports to the same squash file
-srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c "
-    exec 9>\"$LOCK_FILE\"
-    flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
-    if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
-        echo 'Squash file already exists and is valid, skipping import'
+    cleanup_and_save_logs() {
+        if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
+            save_multinode_diagnostics
+        fi
+        if [[ "${STAGING_ALLOCATION_ID:-}" =~ ^[0-9]+$ ]] && [[ -n "$(squeue -j "$STAGING_ALLOCATION_ID" --noheader 2>/dev/null)" ]]; then
+            scancel "$STAGING_ALLOCATION_ID" || true
+        fi
+        [[ -n "${WORKSPACE_ARCHIVE:-}" ]] && rm -f "$WORKSPACE_ARCHIVE" 2>/dev/null || true
+        local err_file="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID:-unknown}.err"
+        if [[ ! "${JOB_ID:-}" =~ ^[0-9]+$ ]]; then
+            err_file="$BENCHMARK_LOGS_DIR/slurm_job-unknown.err"
+        fi
+        if [[ -s "$err_file" ]]; then
+            echo "=== Slurm job stderr ==="
+            tail -100 "$err_file"
+            echo "========================"
+        fi
+        sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true
+    }
+    if [[ "${KEEP_LOGS:-0}" == "1" ]]; then
+        trap '' EXIT
     else
-        rm -f \"$SQUASH_FILE\"
-        enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
-    fi
-"
-srun --jobid=$JOB_ID \
---container-image=$SQUASH_FILE \
---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
---container-mount-home \
---container-writable \
---container-remap-root \
---container-workdir=/workspace/ \
---no-container-entrypoint --export=ALL \
-bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi300x.sh
-
-scancel $JOB_ID
\ No newline at end of file
+        trap cleanup_and_save_logs EXIT
+    fi
+
+    SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi300x_${FRAMEWORK}.sh"
+    if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]]; then
+        BENCHMARK_SUBDIR="multi_node"
+    else
+        BENCHMARK_SUBDIR="single_node"
+    fi
+
+    NUM_NODES_REQUIRED=$((PREFILL_NODES + DECODE_NODES))
+    SUBMIT_HOST=$(hostname -s)
+    SANITIZED_RUNNER=$(printf '%s' "${RUNNER_NAME:-runner}" | tr -c 'a-zA-Z0-9_.-' '_')
+    STAGED_WORKSPACE="/tmp/inferencex-${USER}-${GITHUB_RUN_ID:-manual}-${SANITIZED_RUNNER}"
+    JOB_BENCHMARK_LOGS_DIR="${STAGED_WORKSPACE}/benchmark_logs"
+    WORKSPACE_ARCHIVE="${RUNNER_TEMP:-/tmp}/inferencex-workspace-${USER}-${GITHUB_RUN_ID:-manual}-${SANITIZED_RUNNER}.tar"
+
+    NODELIST_DISCOVERY_TIMEOUT="${NODELIST_DISCOVERY_TIMEOUT:-900}"
+    NODELIST_DISCOVERY_INTERVAL="${NODELIST_DISCOVERY_INTERVAL:-30}"
+    NODELIST_ALLOCATION_TIMEOUT="${NODELIST_ALLOCATION_TIMEOUT:-120}"
+    NODELIST_ALLOC_IMMEDIATE="${NODELIST_ALLOC_IMMEDIATE:-${NODELIST_PROBE_IMMEDIATE:-30}}"
+    STAGING_TIMEOUT_SECONDS="${STAGING_TIMEOUT_SECONDS:-600}"
+    STAGING_RETRIES="${STAGING_RETRIES:-3}"
+    STAGING_RETRY_DELAY="${STAGING_RETRY_DELAY:-20}"
+    MI300X_ALLOCATION_TIME="${MI300X_ALLOCATION_TIME:-08:30:00}"
+    MI300X_NODE_INVENTORY="${MI300X_NODE_INVENTORY:-chi-mi300x-034 chi-mi300x-035 chi-mi300x-036 chi-mi300x-043 chi-mi300x-049 chi-mi300x-054 chi-mi300x-057 chi-mi300x-058 chi-mi300x-121}"
+
+    echo "Creating reusable workspace archive at ${WORKSPACE_ARCHIVE}"
+    rm -f "$WORKSPACE_ARCHIVE"
+    tar \
+        --exclude='./benchmark_logs' \
+        --exclude='./benchmark_artifacts' \
+        --exclude='./multinode_server_logs.tar.gz' \
+        --exclude='./.git' \
+        -C "$GITHUB_WORKSPACE" -cf "$WORKSPACE_ARCHIVE" .
+
+    allocate_nodes() {
+        local requested_nodelist=$1
+        local alloc_output
+        local alloc_id
+        local errexit_was_set=0
+
+        [[ $- == *e* ]] && errexit_was_set=1
+
+        echo "Requesting exclusive MI300X allocation from NODELIST=${requested_nodelist}"
+        set +e
+        alloc_output=$(timeout "${NODELIST_ALLOCATION_TIMEOUT}s" salloc \
+            --immediate="$NODELIST_ALLOC_IMMEDIATE" \
+            --partition="$SLURM_PARTITION" \
+            --account="$SLURM_ACCOUNT" \
+            --exclusive \
+            --gres="gpu:${GPUS_PER_NODE}" \
+            -N "$NUM_NODES_REQUIRED" \
+            -n "$NUM_NODES_REQUIRED" \
+            --nodelist="$requested_nodelist" \
+            --time="$MI300X_ALLOCATION_TIME" \
+            --no-shell \
+            --job-name="$RUNNER_NAME" 2>&1)
+        local alloc_rc=$?
+        if [[ "$errexit_was_set" -eq 1 ]]; then
+            set -e
+        else
+            set +e
+        fi
+        printf '%s\n' "$alloc_output"
+
+        if [[ "$alloc_rc" -ne 0 ]]; then
+            return 1
+        fi
+
+        alloc_id=$(awk '/Granted job allocation/ {print $NF}' <<< "$alloc_output" | tail -n 1)
+        if [[ ! "$alloc_id" =~ ^[0-9]+$ ]]; then
+            echo "ERROR: Failed to parse allocation id from salloc output" >&2
+            return 1
+        fi
+
+        STAGING_ALLOCATION_ID="$alloc_id"
+        export STAGING_ALLOCATION_ID
+        return 0
+    }
+
+    stage_workspace_local() {
+        echo "Staging workspace locally on ${SUBMIT_HOST}:${STAGED_WORKSPACE}"
+        rm -rf "$STAGED_WORKSPACE"
+        mkdir -p "$STAGED_WORKSPACE" "$JOB_BENCHMARK_LOGS_DIR"
+        tar -C "$STAGED_WORKSPACE" -xf "$WORKSPACE_ARCHIVE"
+        test -f "$STAGED_WORKSPACE/benchmarks/multi_node/amd_utils/job.slurm"
+        test -d "$JOB_BENCHMARK_LOGS_DIR"
+    }
+
+    stage_workspace_remote_once() {
+        local node=$1
+        timeout "${STAGING_TIMEOUT_SECONDS}s" srun --jobid="$STAGING_ALLOCATION_ID" --overlap \
+            --nodes=1 --ntasks=1 --nodelist="$node" \
+            bash -lc "rm -rf '$STAGED_WORKSPACE' && mkdir -p '$STAGED_WORKSPACE' '$JOB_BENCHMARK_LOGS_DIR' && tar -C '$STAGED_WORKSPACE' -xf - && test -f '$STAGED_WORKSPACE/benchmarks/multi_node/amd_utils/job.slurm' && test -d '$JOB_BENCHMARK_LOGS_DIR'" \
+            < "$WORKSPACE_ARCHIVE"
+    }
+
+    stage_workspace_remote() {
+        # Stage the workspace onto node $1, retrying up to STAGING_RETRIES times; 0 on success, 1 if all attempts fail.
+        local node=$1
+        local attempt
+        for ((attempt = 1; attempt <= STAGING_RETRIES; attempt++)); do
+            echo "Staging workspace to ${node}:${STAGED_WORKSPACE} (attempt ${attempt}/${STAGING_RETRIES})"
+            if stage_workspace_remote_once "$node"; then
+                return 0
+            fi
+            echo "WARNING: Failed to stage workspace on ${node} (attempt ${attempt}/${STAGING_RETRIES})" >&2
+            # Back off between attempts only; sleeping after the final failure would just delay the error.
+            if (( attempt < STAGING_RETRIES )); then sleep "$STAGING_RETRY_DELAY"; fi
+        done
+        return 1
+    }
+
+    if [[ -z "${NODELIST:-}" ]]; then
+        CANDIDATE_NODES=()
+        for candidate in $MI300X_NODE_INVENTORY; do
+            [[ -n "$candidate" ]] || continue
+            if [[ ",${SLURM_EXCLUDE_NODES}," == *",${candidate},"* ]]; then
+                echo "Skipping excluded MI300X allocation candidate: $candidate"
+                continue
+            fi
+            CANDIDATE_NODES+=("$candidate")
+        done
+
+        if [[ "${#CANDIDATE_NODES[@]}" -lt "$NUM_NODES_REQUIRED" ]]; then
+            echo "ERROR: Need ${NUM_NODES_REQUIRED} MI300X nodes but only ${#CANDIDATE_NODES[@]} candidates remain after exclusions." >&2
+            echo "MI300X node inventory checked: ${MI300X_NODE_INVENTORY}" >&2
+            echo "SLURM_EXCLUDE_NODES=${SLURM_EXCLUDE_NODES:-}" >&2
+            exit 1
+        fi
+
+        REQUESTED_NODELIST=$(IFS=,; echo "${CANDIDATE_NODES[*]}")
+        discovery_start=$(date +%s)
+
+        while true; do
+            if allocate_nodes "$REQUESTED_NODELIST"; then
+                break
+            fi
+
+            now=$(date +%s)
+            elapsed=$((now - discovery_start))
+            if (( elapsed >= NODELIST_DISCOVERY_TIMEOUT )); then
+                echo "ERROR: Failed to allocate ${NUM_NODES_REQUIRED} exclusive MI300X nodes after ${elapsed}s." >&2
+                echo "Requested NODELIST=${REQUESTED_NODELIST}" >&2
+                echo "MI300X node inventory checked: ${MI300X_NODE_INVENTORY}" >&2
+                sinfo -N -p "$SLURM_PARTITION" -o "%N %T" >&2 || true
+                exit 1
+            fi
+
+            echo "Allocation not available yet; retrying in ${NODELIST_DISCOVERY_INTERVAL}s..."
+            sleep "$NODELIST_DISCOVERY_INTERVAL"
+        done
+
+    else
+        echo "Using caller-provided NODELIST=${NODELIST}"
+        IFS=',' read -r -a SELECTED_NODES <<< "$NODELIST"
+        if ! allocate_nodes "$NODELIST"; then
+            echo "ERROR: Failed to allocate caller-provided NODELIST=${NODELIST}" >&2
+            exit 1
+        fi
+    fi
+
+    ALLOC_NODELIST=$(squeue -h -j "$STAGING_ALLOCATION_ID" -o '%N')
+    if [[ -z "$ALLOC_NODELIST" ]]; then
+        echo "ERROR: Failed to resolve nodelist for allocation ${STAGING_ALLOCATION_ID}" >&2
+        exit 1
+    fi
+    mapfile -t SELECTED_NODES < <(scontrol show hostnames "$ALLOC_NODELIST")
+    NODELIST=$(IFS=,; echo "${SELECTED_NODES[*]}")
+    echo "Using allocated NODELIST=${NODELIST} from allocation ${STAGING_ALLOCATION_ID}"
+
+    export NODELIST
+    export SLURM_REUSE_JOBID="$STAGING_ALLOCATION_ID"
+    export SLURM_JOB_NODELIST="$ALLOC_NODELIST"
+    export SLURM_NODELIST="$ALLOC_NODELIST"
+
+    stage_workspace_local
+    for node in "${SELECTED_NODES[@]}"; do
+        if [[ "$node" == "$SUBMIT_HOST" ]]; then
+            echo "Skipping remote staging for local allocation node ${node}"
+            continue
+        fi
+        if ! stage_workspace_remote "$node"; then
+            echo "ERROR: Failed to stage workspace on ${node}" >&2
+            exit 1
+        fi
+    done
+
+    BENCHMARK_LOGS_DIR="$JOB_BENCHMARK_LOGS_DIR"
+    export BENCHMARK_LOGS_DIR
+
+    SUBMIT_LOG="$BENCHMARK_LOGS_DIR/submit_${SCRIPT_NAME%.sh}.log"
+    GITHUB_WORKSPACE="$STAGED_WORKSPACE" bash "$STAGED_WORKSPACE/benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}" > "$SUBMIT_LOG" 2>&1
+    SUBMIT_RC=$?
+    cat "$SUBMIT_LOG"
+    JOB_ID=$(grep -E '^[0-9]+$' "$SUBMIT_LOG" | tail -n 1 || true)
+    if [[ "$SUBMIT_RC" -ne 0 ]]; then
+        echo "ERROR: Failed to submit multi-node job via benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}"
+        echo "=== Submit log ==="
+        cat "$SUBMIT_LOG" || true
+        echo "=================="
+        exit 1
+    fi
+
+    if [[ ! "$JOB_ID" =~ ^[0-9]+$ ]]; then
+        echo "ERROR: Expected numeric Slurm job id, got '$JOB_ID'"
+        echo "=== Submit log ==="
+        cat "$SUBMIT_LOG" || true
+        echo "=================="
+        exit 1
+    fi
+
+    LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out"
+    REUSE_JOB_PID_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.pid"
+
+    sleep 10
+
+    while ! ls "$LOG_FILE" &>/dev/null; do  # poll until the job's .out file exists; abort if the job is already gone
+        if [[ -n "${SLURM_REUSE_JOBID:-}" ]]; then  # reused allocation: liveness is checked through the pid file
+            if [[ ! -f "$REUSE_JOB_PID_FILE" ]]; then
+                echo "ERROR: Missing reused job pid file $REUSE_JOB_PID_FILE before log file was created"
+                save_multinode_diagnostics
+                exit 1
+            fi
+            REUSE_JOB_PID=$(<"$REUSE_JOB_PID_FILE")
+            if [[ ! "$REUSE_JOB_PID" =~ ^[0-9]+$ ]] || ! kill -0 "$REUSE_JOB_PID" 2>/dev/null; then  # kill -0 only probes the pid
+                echo "ERROR: Reused job $JOB_ID exited before creating log file"
+                save_multinode_diagnostics
+                exit 1
+            fi
+        else
+            if ! squeue -u "$USER" --noheader --format='%i' | grep -qwF -- "$JOB_ID"; then  # whole-word match: id 123 must not match 1234
+                echo "ERROR: Job $JOB_ID failed before creating log file"
+                scontrol show job "$JOB_ID" || true
+                save_multinode_diagnostics
+                exit 1
+            fi
+        fi
+        sleep 5
+    done
+
+    set +x
+
+    if [[ -n "${SLURM_REUSE_JOBID:-}" ]]; then  # start a background poller that exits once the job is gone
+        REUSE_JOB_PID=$(<"$REUSE_JOB_PID_FILE")
+        (
+            while kill -0 "$REUSE_JOB_PID" 2>/dev/null; do
+                sleep 10
+            done
+        ) &
+    else
+        (
+            while squeue -u "$USER" --noheader --format='%i' | grep -qwF -- "$JOB_ID"; do  # whole-word match: id 123 must not match 1234
+                sleep 10
+            done
+        ) &
+    fi
+    POLL_PID=$!  # pid of whichever poller was started; used by the tail --pid / wait that follow
+
+    tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null
+
+    wait $POLL_PID
+
+    set -x
+
+    if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
+        cat > collect_latest_results.py <<'PY'
+import os, sys
+job_dir, isl, osl, nexp, framework = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]), sys.argv[5]
+logs_root = f"{job_dir}/logs/"
+candidates = []
+if os.path.isdir(logs_root):
+    for name in os.listdir(logs_root):
+        subdir = f"{logs_root}{name}/{framework}_isl_{isl}_osl_{osl}"
+        if os.path.isdir(subdir):
+            candidates.append(subdir)
+for path in sorted(candidates, key=os.path.getmtime, reverse=True)[:nexp]:
+    print(path)
+PY
+
+        LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1 "$FRAMEWORK")
+        if [ -z "$LOGS_DIR" ]; then
+            echo "No logs directory found for ISL=${ISL}, OSL=${OSL}"
+            exit 1
+        fi
+
+        echo "Found logs directory: $LOGS_DIR"
+        ls -la "$LOGS_DIR"
+
+        while IFS= read -r -d '' result_file; do  # NUL-delimited so paths with spaces or glob characters stay intact
+            file_name=$(basename "$result_file")
+            if [ -f "$result_file" ]; then
+                WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}"  # flattened into the workspace root
+                echo "Found result file ${result_file}. Copying it to ${WORKSPACE_RESULT_FILE}"
+                cp "$result_file" "$WORKSPACE_RESULT_FILE"
+            fi
+        done < <(find "$LOGS_DIR" -type f -print0)
+    fi
+
+    if [[ "${RUN_EVAL:-false}" == "true" ]]; then
+        EVAL_DIR=$(find "$BENCHMARK_LOGS_DIR/logs" -type d -name eval_results 2>/dev/null | head -1)
+        if [ -n "$EVAL_DIR" ] && [ -d "$EVAL_DIR" ]; then
+            echo "Extracting eval results from $EVAL_DIR"
+            shopt -s nullglob
+            for eval_file in "$EVAL_DIR"/*; do
+                [ -f "$eval_file" ] || continue
+                cp "$eval_file" "$GITHUB_WORKSPACE/"
+                echo "Copied eval artifact: $(basename "$eval_file")"
+            done
+            shopt -u nullglob
+        else
+            echo "WARNING: RUN_EVAL=true but no eval results found under $BENCHMARK_LOGS_DIR/logs"
+        fi
+    fi
+
+    echo "All result files processed"
+    set +x
+    scancel_sync $JOB_ID
+    set -x
+    echo "Canceled the slurm job $JOB_ID"
+
+    sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
+
+else
+
+    export HF_HUB_CACHE_MOUNT="/raid/hf-hub-cache/"
+    export PORT=8888
+
+    PARTITION="compute"
+    SQUASH_FILE="/home/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+    LOCK_FILE="${SQUASH_FILE}.lock"
+
+    set -x
+
+    EXCLUDE_OPT=()
+    if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then
+        EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES")
+    fi
+
+    JOB_ID=$(salloc --partition=$PARTITION "${EXCLUDE_OPT[@]}" --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+
+    if [ -z "$JOB_ID" ]; then
+        echo "ERROR: salloc failed to allocate a job"
+        exit 1
+    fi
+
+    # Use flock to serialize concurrent imports to the same squash file
+    srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c "
+        exec 9>\"$LOCK_FILE\"
+        flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
+        if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
+            echo 'Squash file already exists and is valid, skipping import'
+        else
+            rm -f \"$SQUASH_FILE\"
+            enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
+        fi
+    "
+    srun --jobid=$JOB_ID \
+    --container-image=$SQUASH_FILE \
+    --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+    --container-mount-home \
+    --container-writable \
+    --container-remap-root \
+    --container-workdir=/workspace/ \
+    --no-container-entrypoint --export=ALL \
+    bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi300x.sh
+
+    scancel $JOB_ID
+fi
diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh
index 810cbde2f..1e0b25ed5 100644
--- a/runners/launch_mi325x-amds.sh
+++ b/runners/launch_mi325x-amds.sh
@@ -1,44 +1,204 @@
 #!/usr/bin/env bash
 
-export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/"
-export PORT=8888
+scancel_sync() {
+    local jobid=$1
+    local timeout=${2:-600}
+    local interval=10
+    local start
+    start=$(date +%s)
 
-PARTITION="compute"
-SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
-LOCK_FILE="${SQUASH_FILE}.lock"
+    echo "[scancel_sync] Requesting cancel of job $jobid"
+    scancel "$jobid" || true
 
-set -x
+    while [[ -n "$(squeue -j "$jobid" --noheader 2>/dev/null)" ]]; do
+        local now
+        now=$(date +%s)
+        if (( now - start >= timeout )); then
+            echo "[scancel_sync][WARN] job $jobid still present after ${timeout}s"
+            return 1
+        fi
+        echo "[scancel_sync] waiting for job $jobid to exit. $((timeout-(now-start))) secs remaining..."
+        sleep "$interval"
+    done
+    echo "[scancel_sync] job $jobid exited"
+    return 0
+}
 
 # Exclude known-broken mi325x nodes:
 #   chi-mi325x-pod1-121: enroot-aufs2ovlfs setcap fails on this node's NFS-backed
 #                        squash dir; container image import never completes
 #                        (root-caused via #1467/#1468/#1469 sweep failures).
-JOB_ID=$(salloc --partition=$PARTITION --exclude=chi-mi325x-pod1-121.ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+export SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES:-chi-mi325x-pod1-021.ord.vultr.cpe.ice.amd.com,chi-mi325x-pod1-027.ord.vultr.cpe.ice.amd.com,chi-mi325x-pod1-028.ord.vultr.cpe.ice.amd.com,chi-mi325x-pod1-030.ord.vultr.cpe.ice.amd.com,chi-mi325x-pod1-121.ord.vultr.cpe.ice.amd.com}"
 
-if [ -z "$JOB_ID" ]; then
-    echo "ERROR: salloc failed to allocate a job"
-    exit 1
-fi
 
-# Use flock to serialize concurrent imports to the same squash file
-srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c "
-    exec 9>\"$LOCK_FILE\"
-    flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
-    if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
-        echo 'Squash file already exists and is valid, skipping import'
+if [[ "$IS_MULTINODE" == "true" ]]; then
+    set -x
+
+    export SLURM_ACCOUNT="$USER"
+    export SLURM_PARTITION="compute"
+    export SLURM_JOB_NAME="benchmark-${FRAMEWORK}.job"
+
+    export MODEL_NAME=${MODEL##*/}
+    export MODEL_PATH="/nfsdata/sa/gharunner/gharunners/hf-hub-cache"
+    export IBDEVICES="rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7"
+    export MORI_RDMA_TC=104
+
+    export MODEL_DIR="$MODEL_PATH"
+    export GPUS_PER_NODE=8
+
+    export ISL="$ISL"
+    export OSL="$OSL"
+
+    export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}"
+    mkdir -p "$BENCHMARK_LOGS_DIR"
+    rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
+
+    cleanup_and_save_logs() {  # EXIT-trap handler: copy Slurm logs to the artifact dir, echo stderr, then delete the logs dir
+        if [[ -n "${GITHUB_ACTIONS:-}" && -n "${JOB_ID:-}" ]]; then  # only under GitHub Actions and once a job id is known
+            local art_dir="$GITHUB_WORKSPACE/benchmark_artifacts"
+            mkdir -p "$art_dir"
+            cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$art_dir/" 2>/dev/null || true  # best effort; a missing file is ignored
+        fi
+        local err_file="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID:-unknown}.err"  # falls back to "unknown" when JOB_ID is unset
+        if [[ -s "$err_file" ]]; then  # only when the .err file exists and is non-empty
+            echo "=== Slurm job stderr ==="
+            tail -100 "$err_file"
+            echo "========================"
+        fi
+        rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true  # runs last, after the copies above
+    }
+    trap cleanup_and_save_logs EXIT
+
+    SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi325x_${FRAMEWORK}.sh"
+    if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]]; then
+        BENCHMARK_SUBDIR="multi_node"
     else
-        rm -f \"$SQUASH_FILE\"
-        enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
+        BENCHMARK_SUBDIR="single_node"
     fi
-"
-srun --jobid=$JOB_ID \
---container-image=$SQUASH_FILE \
---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
---container-mount-home \
---container-writable \
---container-remap-root \
---container-workdir=/workspace/ \
---no-container-entrypoint --export=ALL \
-bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi325x.sh
-
-scancel $JOB_ID
+    JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}")  # stdout of the submit script is taken as the Slurm job id
+    [[ "$JOB_ID" =~ ^[0-9]+$ ]] || { echo "ERROR: Expected numeric Slurm job id, got '$JOB_ID'" >&2; exit 1; }
+    LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out"
+
+    sleep 10
+
+    while ! ls "$LOG_FILE" &>/dev/null; do  # poll until the job's .out file exists; abort if the job left the queue first
+        if ! squeue -u "$USER" --noheader --format='%i' | grep -qwF -- "$JOB_ID"; then  # whole-word match: id 123 must not match 1234
+            echo "ERROR: Job $JOB_ID failed before creating log file"
+            scontrol show job "$JOB_ID"
+            exit 1
+        fi
+        sleep 5
+    done
+
+    set +x
+
+    (
+        while squeue -u "$USER" --noheader --format='%i' | grep -qwF -- "$JOB_ID"; do  # whole-word match: id 123 must not match 1234
+            sleep 10
+        done
+    ) &
+    POLL_PID=$!  # pid of the background poller; used by the tail --pid / wait that follow
+
+    tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null
+
+    wait $POLL_PID
+
+    set -x
+
+    if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
+        cat > collect_latest_results.py <<'PY'
+import os, sys
+sgl_job_dir, isl, osl, nexp, framework = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]), sys.argv[5]
+for path in sorted([f"{sgl_job_dir}/logs/{name}/{framework}_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/{framework}_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]:
+    print(path)
+PY
+
+        LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1 "$FRAMEWORK")
+        if [ -z "$LOGS_DIR" ]; then
+            echo "No logs directory found for ISL=${ISL}, OSL=${OSL}"
+            exit 1
+        fi
+
+        echo "Found logs directory: $LOGS_DIR"
+        ls -la "$LOGS_DIR"
+
+        while IFS= read -r -d '' result_file; do  # NUL-delimited so paths with spaces or glob characters stay intact
+            file_name=$(basename "$result_file")
+            if [ -f "$result_file" ]; then
+                WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}"  # flattened into the workspace root
+                echo "Found result file ${result_file}. Copying it to ${WORKSPACE_RESULT_FILE}"
+                cp "$result_file" "$WORKSPACE_RESULT_FILE"
+            fi
+        done < <(find "$LOGS_DIR" -type f -print0)
+    fi
+
+    if [[ "${RUN_EVAL:-false}" == "true" ]]; then
+        EVAL_DIR=$(find "$BENCHMARK_LOGS_DIR/logs" -type d -name eval_results 2>/dev/null | head -1)
+        if [ -n "$EVAL_DIR" ] && [ -d "$EVAL_DIR" ]; then
+            echo "Extracting eval results from $EVAL_DIR"
+            shopt -s nullglob
+            for eval_file in "$EVAL_DIR"/*; do
+                [ -f "$eval_file" ] || continue
+                cp "$eval_file" "$GITHUB_WORKSPACE/"
+                echo "Copied eval artifact: $(basename "$eval_file")"
+            done
+            shopt -u nullglob
+        else
+            echo "WARNING: RUN_EVAL=true but no eval results found under $BENCHMARK_LOGS_DIR/logs"
+        fi
+    fi
+
+    echo "All result files processed"
+    set +x
+    scancel_sync $JOB_ID
+    set -x
+    echo "Canceled the slurm job $JOB_ID"
+
+    rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
+
+else
+
+    export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/"
+    export PORT=8888
+
+    PARTITION="compute"
+    SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+    LOCK_FILE="${SQUASH_FILE}.lock"
+
+    set -x
+
+    EXCLUDE_OPT=()
+    if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then
+        EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES")
+    fi
+
+    JOB_ID=$(salloc --partition=$PARTITION "${EXCLUDE_OPT[@]}" --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+
+    if [ -z "$JOB_ID" ]; then
+        echo "ERROR: salloc failed to allocate a job"
+        exit 1
+    fi
+
+    # Use flock to serialize concurrent imports to the same squash file
+    srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c "
+        exec 9>\"$LOCK_FILE\"
+        flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
+        if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
+            echo 'Squash file already exists and is valid, skipping import'
+        else
+            rm -f \"$SQUASH_FILE\"
+            enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
+        fi
+    "
+    srun --jobid=$JOB_ID \
+    --container-image=$SQUASH_FILE \
+    --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+    --container-mount-home \
+    --container-writable \
+    --container-remap-root \
+    --container-workdir=/workspace/ \
+    --no-container-entrypoint --export=ALL \
+    bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi325x.sh
+
+    scancel $JOB_ID
+fi
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index 2f700d4e7..8ea9f2d5a 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -52,11 +52,27 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
     sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
 
     # Ensure root-owned files are cleaned up even on early exit to prevent
-    # EACCES errors when the next GH Actions job checks out on this runner
-    trap 'sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true' EXIT
+    # EACCES errors when the next GH Actions job checks out on this runner.
+    # Always preserve slurm logs as CI artifacts for debugging.
+    cleanup_and_save_logs() {
+        if [[ -n "${GITHUB_ACTIONS:-}" && -n "${JOB_ID:-}" ]]; then
+            local art_dir="$GITHUB_WORKSPACE/benchmark_artifacts"
+            mkdir -p "$art_dir"
+            cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$art_dir/" 2>/dev/null || true
+        fi
+        # Print .err inline so failures are visible in CI output
+        local err_file="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID:-unknown}.err"
+        if [[ -s "$err_file" ]]; then
+            echo "=== Slurm job stderr ==="
+            tail -100 "$err_file"
+            echo "========================"
+        fi
+        sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true
+    }
+    trap cleanup_and_save_logs EXIT
 
     SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh"
-    if [[ "$FRAMEWORK" == "sglang-disagg" ]]; then
+    if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]]; then
         BENCHMARK_SUBDIR="multi_node"
     else
         BENCHMARK_SUBDIR="single_node"
@@ -108,12 +124,12 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
     if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
         cat > collect_latest_results.py <<'PY'
 import os, sys
-sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4])
-for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]:
+sgl_job_dir, isl, osl, nexp, framework = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]), sys.argv[5]
+for path in sorted([f"{sgl_job_dir}/logs/{name}/{framework}_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/{framework}_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]:
     print(path)
 PY
 
-        LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1)
+        LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1 "$FRAMEWORK")
         if [ -z "$LOGS_DIR" ]; then
             echo "No logs directory found for ISL=${ISL}, OSL=${OSL}"
             exit 1
@@ -162,16 +178,7 @@ PY
 
     sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
 
-    # Upload logs as artifact if running in GitHub Actions
-    if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
-        ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts"
-        mkdir -p "$ARTIFACT_DIR"
-        cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$ARTIFACT_DIR/" 2>/dev/null || true
-        echo "Logs copied to $ARTIFACT_DIR for artifact upload"
-    fi
-
-    # Clean up root-owned files to prevent EACCES on GH Actions checkout cleanup
-    sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true
+    # Log preservation and cleanup handled by EXIT trap (cleanup_and_save_logs)
 
 else
 
diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py
index 4c8820f8d..9c4221781 100644
--- a/utils/bench_serving/backend_request_func.py
+++ b/utils/bench_serving/backend_request_func.py
@@ -421,10 +421,13 @@ async def async_request_openai_chat_completions(
             output.success = False
             exc_info = sys.exc_info()
             output.error = "".join(traceback.format_exception(*exc_info))
+        finally:
+            if _own_session:
+                await session.close()
 
-    if pbar:
-        pbar.update(1)
-    return output
+        if pbar:
+            pbar.update(1)
+        return output
 
 
 def get_model(pretrained_model_name_or_path: str) -> str: