From ab7c6b20915f9a43012e63f9de4d22beabab88cc Mon Sep 17 00:00:00 2001
From: fuyuajin
Date: Sun, 28 Jun 2026 13:53:25 +0000
Subject: [PATCH] Update Qwen3.5 FP4 MI355X MTP recipe with tuned env/flags

Apply the validated MI355X TP=2 decode recipe to the single-node MTP
benchmark (env/flags only; tuned aiter CSVs and sglang kernel patches are
upstreamed separately):
- speculative-algorithm NEXTN (built-in MTP head) + spec-v1 chain path
- INT4 quick-reduce + single-stage AITER AllReduce
- enable dp-attention, mamba-ssm-dtype bf16, disable shared-experts fusion
- drop allreduce-fusion / unified-attn / flydsl-force / page-size 16
- guard hf download for local model paths
---
Reviewer note (this area is ignored by git am): the patch was restored from
a copy whose line breaks and indentation had been collapsed. Hunk line
counts and the diffstat were re-checked against the hunk headers, but the
leading whitespace of indented context lines is a reconstruction; if a hunk
is rejected, retry with `git apply --ignore-whitespace`.

 .../fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh   | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh
index 8081b824e..00ed32f3a 100755
--- a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh
@@ -15,14 +15,17 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
 
-hf download "$MODEL"
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 export SGLANG_USE_AITER=1
-export SGLANG_USE_AITER_UNIFIED_ATTN=1
-export AITER_FLYDSL_FORCE=1
+# AllReduce latency on TP=2: INT4 quick-reduce + single-stage AITER AllReduce.
+export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+export AITER_AR_1STAGE=1
+# Built-in MTP head (NEXTN) runs on the ROCm-hardened spec-v1 chain path.
+export SGLANG_ENABLE_SPEC_V2=0
 
 SERVER_LOG=/workspace/server.log
-MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.8}
+MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.85}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
@@ -40,9 +43,11 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \
 --model-loader-extra-config '{"enable_multithread_load": true}' \
 --watchdog-timeout 1200 \
 --disable-radix-cache \
---enable-aiter-allreduce-fusion --max-running-requests $CONC \
---page-size 16 \
---speculative-algorithm EAGLE \
+--enable-dp-attention \
+--mamba-ssm-dtype bfloat16 \
+--disable-shared-experts-fusion \
+--max-running-requests $CONC \
+--speculative-algorithm NEXTN \
 --speculative-num-steps 3 \
 --speculative-eagle-topk 1 \
 --speculative-num-draft-tokens 4 \