From ab7c6b20915f9a43012e63f9de4d22beabab88cc Mon Sep 17 00:00:00 2001
From: fuyuajin <fuyuajin@users.noreply.github.com>
Date: Sun, 28 Jun 2026 13:53:25 +0000
Subject: [PATCH] Update Qwen3.5 FP4 MI355X MTP recipe with tuned env/flags

Apply the validated MI355X TP=2 decode recipe to the single-node MTP
benchmark (env/flags only; tuned aiter CSVs and sglang kernel patches are
upstreamed separately):
- speculative-algorithm NEXTN (built-in MTP head) + spec-v1 chain path
- INT4 quick-reduce + single-stage AITER AllReduce
- enable dp-attention, mamba-ssm-dtype bf16, disable shared-experts fusion
- drop allreduce-fusion / unified-attn / flydsl-force / page-size 16
- skip hf download when MODEL is an absolute local path
- raise the default MEM_FRAC_STATIC from 0.8 to 0.85
---
 .../fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh   | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh
index 8081b824e..00ed32f3a 100755
--- a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh
@@ -15,14 +15,17 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
 
-hf download "$MODEL"
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 export SGLANG_USE_AITER=1
-export SGLANG_USE_AITER_UNIFIED_ATTN=1
-export AITER_FLYDSL_FORCE=1
+# Reduce AllReduce latency on TP=2: INT4 quick-reduce + single-stage AITER AllReduce.
+export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+export AITER_AR_1STAGE=1
+# Built-in MTP head (NEXTN) runs on the ROCm-hardened spec-v1 chain path.
+export SGLANG_ENABLE_SPEC_V2=0
 
 SERVER_LOG=/workspace/server.log
-MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.8}
+MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.85}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
@@ -40,9 +43,11 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \
 --model-loader-extra-config '{"enable_multithread_load": true}' \
 --watchdog-timeout 1200  \
 --disable-radix-cache \
---enable-aiter-allreduce-fusion --max-running-requests $CONC \
---page-size 16 \
---speculative-algorithm EAGLE \
+--enable-dp-attention \
+--mamba-ssm-dtype bfloat16 \
+--disable-shared-experts-fusion \
+--max-running-requests $CONC \
+--speculative-algorithm NEXTN \
 --speculative-num-steps 3 \
 --speculative-eagle-topk 1 \
 --speculative-num-draft-tokens 4 \