Commit acede7d

Fuse routed scaling factor into select_experts for FP4
1 parent 9c138a0 commit acede7d

8 files changed: +87 −32 lines changed

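The change relies on the linearity of the MoE combine step: the layer output is a weighted sum of expert outputs, so multiplying the top-k routing weights by routed_scaling_factor during expert selection gives the same result as multiplying the combined output afterwards, while saving an extra elementwise pass over the output. A minimal standalone sketch of that equivalence (toy tensors, not the sglang code paths):

import torch

def combine(expert_outs, weights):
    # expert_outs: [num_tokens, topk, hidden], weights: [num_tokens, topk]
    return (expert_outs * weights.unsqueeze(-1)).sum(dim=1)

torch.manual_seed(0)
expert_outs = torch.randn(4, 8, 16)   # 4 tokens, top-8 experts, hidden size 16
weights = torch.softmax(torch.randn(4, 8), dim=-1)
s = 2.5  # routed_scaling_factor

out_scaled_after = s * combine(expert_outs, weights)    # old: scale the MoE output
out_fused_weights = combine(expert_outs, weights * s)   # new: factor fused into the topk weights
assert torch.allclose(out_scaled_after, out_fused_weights, atol=1e-6)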

python/sglang/srt/layers/moe/topk.py

Lines changed: 27 additions & 1 deletion
@@ -132,6 +132,7 @@ def __init__(
         scoring_func: str = "softmax",
         correction_bias: Optional[torch.Tensor] = None,
         routed_scaling_factor: Optional[float] = None,
+        apply_routed_scaling_factor_on_output: Optional[bool] = False,
     ):
         # NOTE: scoring_func is not used for now, but we keep it for future use
         # see https://github.com/sgl-project/sglang/pull/4505 for more details
@@ -147,6 +148,9 @@ def __init__(
         self.custom_routing_function = custom_routing_function
         self.correction_bias = correction_bias
         self.routed_scaling_factor = routed_scaling_factor
+        self.apply_routed_scaling_factor_on_output = (
+            apply_routed_scaling_factor_on_output
+        )

         self.use_triton_kernels = global_server_args_dict["enable_triton_kernel_moe"]

@@ -204,6 +208,7 @@ def forward_cuda(
             correction_bias=self.correction_bias,
             torch_native=torch_native,
             routed_scaling_factor=self.routed_scaling_factor,
+            apply_routed_scaling_factor_on_output=self.apply_routed_scaling_factor_on_output,
             num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,
         )
@@ -372,6 +377,7 @@ def grouped_topk_gpu(
     topk_group: int = 0,
     num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
+    apply_routed_scaling_factor_on_output: Optional[bool] = False,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
@@ -417,6 +423,8 @@ def grouped_topk_gpu(
             else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         )
         topk_weights = topk_weights / topk_weights_sum
+        if apply_routed_scaling_factor_on_output:
+            topk_weights *= routed_scaling_factor

     topk_weights, topk_ids = topk_weights.to(torch.float32), topk_ids.to(torch.int32)
     topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
@@ -433,10 +441,12 @@ def grouped_topk_cpu(
     topk_group: int = 0,
     num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
+    apply_routed_scaling_factor_on_output: Optional[bool] = False,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert expert_location_dispatch_info is None
+    assert not apply_routed_scaling_factor_on_output
     return torch.ops.sgl_kernel.grouped_topk_cpu(
         hidden_states,
         gating_output,
@@ -461,6 +471,7 @@ def biased_grouped_topk_impl(
     topk_group: int = 0,
     num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
+    apply_routed_scaling_factor_on_output: Optional[bool] = False,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
@@ -509,7 +520,10 @@ def biased_grouped_topk_impl(
             if num_fused_shared_experts == 0
             else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         )
-        topk_weights = topk_weights / topk_weights_sum
+        if apply_routed_scaling_factor_on_output:
+            topk_weights = topk_weights / topk_weights_sum * routed_scaling_factor
+        else:
+            topk_weights = topk_weights / topk_weights_sum

     topk_weights, topk_ids = topk_weights.to(torch.float32), topk_ids.to(torch.int32)
     topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
@@ -549,6 +563,7 @@ def biased_grouped_topk_gpu(
     num_expert_group: int = 0,
     topk_group: int = 0,
     num_fused_shared_experts: int = 0,
+    apply_routed_scaling_factor_on_output: bool = False,
     routed_scaling_factor: Optional[float] = None,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
@@ -571,6 +586,7 @@ def biased_grouped_topk_gpu(
             topk,
             num_fused_shared_experts,
             routed_scaling_factor,
+            apply_routed_scaling_factor_on_output,
         )
         # TODO merge into kernel
         if (expert_location_dispatch_info is not None) or (
@@ -581,6 +597,7 @@ def biased_grouped_topk_gpu(
         )
         return topk_weights, topk_ids
     elif _use_aiter:
+        assert not apply_routed_scaling_factor_on_output, "Not implemented"
        token = gating_output.shape[0]
        device = gating_output.device
        assert (
@@ -610,6 +627,7 @@ def biased_grouped_topk_gpu(
             topk_group,
             num_fused_shared_experts=num_fused_shared_experts,
             routed_scaling_factor=routed_scaling_factor,
+            apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output,
             num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,
         )
@@ -626,10 +644,12 @@ def biased_grouped_topk_cpu(
     compiled: bool = True,
     num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
+    apply_routed_scaling_factor_on_output: Optional[bool] = False,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert expert_location_dispatch_info is None
+    assert not apply_routed_scaling_factor_on_output
     return torch.ops.sgl_kernel.biased_grouped_topk_cpu(
         hidden_states,
         gating_output,
@@ -669,6 +689,7 @@ def select_experts(
     correction_bias: Optional[torch.Tensor] = None,
     torch_native: bool = False,
     routed_scaling_factor: Optional[float] = None,
+    apply_routed_scaling_factor_on_output: Optional[bool] = False,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ) -> TopKOutput:
@@ -694,6 +715,7 @@ def select_experts(
             topk_group=topk_group,
             num_fused_shared_experts=num_fused_shared_experts,
             routed_scaling_factor=routed_scaling_factor,
+            apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output,
             num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,
         )
@@ -707,6 +729,7 @@ def select_experts(
             num_expert_group=num_expert_group,
             topk_group=topk_group,
             num_fused_shared_experts=num_fused_shared_experts,
+            apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output,
             routed_scaling_factor=routed_scaling_factor,
             num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,
@@ -716,13 +739,15 @@ def select_experts(
             num_token_non_padded is None
         ), "num_token_non_padded is not yet supported in fused_topk_native"
         assert expert_location_dispatch_info is None
+        assert not apply_routed_scaling_factor_on_output, "Not implemented"
         topk_weights, topk_ids = fused_topk_native(
             hidden_states=hidden_states,
             gating_output=router_logits,
             topk=top_k,
             renormalize=renormalize,
         )
     elif custom_routing_function is None:
+        assert not apply_routed_scaling_factor_on_output, "Not implemented"
         # Qwen3MOE uses fused_topk
         topk_weights, topk_ids = fused_topk(
             hidden_states=hidden_states,
@@ -737,6 +762,7 @@ def select_experts(
             num_token_non_padded is None
         ), "num_token_non_padded is not yet supported in custom_routing_function"
         assert expert_location_dispatch_info is None
+        assert not apply_routed_scaling_factor_on_output, "Not implemented"
         topk_weights, topk_ids = custom_routing_function(
             hidden_states=hidden_states,
             gating_output=router_logits,
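Taken together, the topk.py changes thread the new apply_routed_scaling_factor_on_output flag from TopK.__init__ through select_experts into the grouped and biased-grouped top-k paths; when the flag is set, the renormalized weights are multiplied by routed_scaling_factor before being returned, and paths that do not implement the fusion (CPU ops, aiter, fused_topk, custom routing functions) assert that it is off. A condensed sketch of the new weight post-processing, simplified from biased_grouped_topk_impl (names follow the diff; shapes are assumed):

import torch
from typing import Optional

def finalize_topk_weights(
    topk_weights: torch.Tensor,  # [num_tokens, topk] raw gate scores
    num_fused_shared_experts: int = 0,
    routed_scaling_factor: Optional[float] = None,
    apply_routed_scaling_factor_on_output: bool = False,
) -> torch.Tensor:
    # Renormalize over the routed experts only; a fused shared expert
    # (last column) is excluded from the normalization sum.
    topk_weights_sum = (
        topk_weights.sum(dim=-1, keepdim=True)
        if num_fused_shared_experts == 0
        else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
    )
    if apply_routed_scaling_factor_on_output:
        # Fused path: bake routed_scaling_factor into the weights so the
        # MoE output needs no rescaling afterwards.
        return topk_weights / topk_weights_sum * routed_scaling_factor
    return topk_weights / topk_weights_sum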

python/sglang/srt/layers/quantization/modelopt_quant.py

Lines changed: 1 addition & 4 deletions
@@ -967,6 +967,7 @@ def apply(
         tp_rank: Optional[int] = None,
         tp_size: Optional[int] = None,
     ) -> torch.Tensor:
+        # Scale by routed_scaling_factor is fused into select_experts.
         assert activation == "silu", "Only SiLU activation is supported."

         if self.enable_flashinfer_cutlass_moe:
@@ -997,8 +998,6 @@ def apply(
                 tp_rank=tp_rank,
                 tune_max_num_tokens=next_power_of_2(x.shape[0]),
             )[0]
-            if routed_scaling_factor is not None:
-                output *= routed_scaling_factor
             return output

         from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
@@ -1019,6 +1018,4 @@ def apply(
             params=layer.cutlass_moe_params,
             apply_router_weight_on_input=apply_router_weight_on_input,
         ).to(x.dtype)
-        if routed_scaling_factor is not None:
-            output *= routed_scaling_factor
         return output
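With the factor already baked into the top-k weights by select_experts, the FP4 apply path must not rescale the combined output again; the two removed output *= routed_scaling_factor blocks would now double-apply the factor. A toy illustration of the hazard (standalone, made-up values):

import torch

weights = torch.tensor([[0.6, 0.4]])  # renormalized topk weights for one token
expert_outs = torch.ones(1, 2, 4)     # two expert outputs, hidden size 4
s = 2.5                               # routed_scaling_factor

fused = (expert_outs * (weights * s).unsqueeze(-1)).sum(dim=1)  # what the kernels return now
double_scaled = fused * s                                       # what the removed post-scaling would add
assert torch.allclose(fused, torch.full((1, 4), 2.5))
assert torch.allclose(double_scaled, torch.full((1, 4), 6.25))  # wrong by a factor of s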

python/sglang/srt/models/deepseek_v2.py

Lines changed: 21 additions & 15 deletions
@@ -62,6 +62,7 @@
     use_flashinfer_trtllm_moe,
 )
 from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
 from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
@@ -80,6 +81,7 @@
 from sglang.srt.layers.quantization.int8_utils import (
     block_dequant as int8_block_dequant,
 )
+from sglang.srt.layers.quantization.modelopt_quant import ModelOptNvFp4FusedMoEMethod
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope, get_rope_wrapper
 from sglang.srt.layers.utils import is_sm100_supported
@@ -306,21 +308,6 @@ def __init__(
             config=config, prefix=add_prefix("gate", prefix), is_nextn=is_nextn
         )

-        self.topk = (
-            TopK(
-                top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
-                renormalize=config.norm_topk_prob,
-                use_grouped_topk=True,
-                num_expert_group=config.n_group,
-                num_fused_shared_experts=self.num_fused_shared_experts,
-                topk_group=config.topk_group,
-                correction_bias=self.gate.e_score_correction_bias,
-                routed_scaling_factor=self.routed_scaling_factor,
-            )
-            if not use_flashinfer_trtllm_moe
-            else None
-        )
-
         self.experts = get_moe_impl_class()(
             num_experts=config.n_routed_experts
             + self.num_fused_shared_experts
@@ -360,6 +347,25 @@ def __init__(
             ),
         )

+        apply_routed_scaling_factor_on_output = isinstance(
+            self.experts, FusedMoE
+        ) and isinstance(self.experts.quant_method, ModelOptNvFp4FusedMoEMethod)
+        self.topk = (
+            TopK(
+                top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
+                renormalize=config.norm_topk_prob,
+                use_grouped_topk=True,
+                num_expert_group=config.n_group,
+                num_fused_shared_experts=self.num_fused_shared_experts,
+                topk_group=config.topk_group,
+                correction_bias=self.gate.e_score_correction_bias,
+                routed_scaling_factor=self.routed_scaling_factor,
+                apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output,
+            )
+            if not use_flashinfer_trtllm_moe
+            else None
+        )
+
         self.shared_experts_is_int8 = False
         self.shared_experts_is_fp8 = False
         self.shared_experts_weight_block_size = None

sgl-kernel/csrc/common_extension.cc

Lines changed: 1 addition & 1 deletion
@@ -174,7 +174,7 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {

   m.def(
       "moe_fused_gate(Tensor input, Tensor bias, int num_expert_group, int topk_group, int topk, int "
-      "num_fused_shared_experts, float routed_scaling_factor) -> "
+      "num_fused_shared_experts, float routed_scaling_factor, bool apply_routed_scaling_factor_on_output) -> "
       "(Tensor[])");
   m.impl("moe_fused_gate", torch::kCUDA, &moe_fused_gate);
   m.def(
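With the updated schema, callers pass the flag as a trailing boolean when invoking the registered op. A hedged sketch of such a call through the torch custom-op registry (requires a CUDA device and the sgl_kernel extension; shapes and values are illustrative, borrowed from the DeepSeek-V3 configuration mentioned in the kernel comments):

import torch
import sgl_kernel  # registers the sgl_kernel custom ops (assumed importable)

num_tokens, num_experts = 8, 256
gating_output = torch.randn(num_tokens, num_experts, dtype=torch.float32, device="cuda")
correction_bias = torch.randn(num_experts, dtype=torch.float32, device="cuda")

topk_weights, topk_ids = torch.ops.sgl_kernel.moe_fused_gate(
    gating_output,    # Tensor input
    correction_bias,  # Tensor bias
    8,                # int num_expert_group
    4,                # int topk_group
    8,                # int topk
    0,                # int num_fused_shared_experts
    2.5,              # float routed_scaling_factor
    True,             # bool apply_routed_scaling_factor_on_output (new)
)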

sgl-kernel/csrc/moe/moe_fused_gate.cu

Lines changed: 20 additions & 7 deletions
@@ -59,6 +59,7 @@ __device__ void moe_fused_gate_impl(
     int64_t topk,
     int64_t num_fused_shared_experts,
     double routed_scaling_factor,
+    bool apply_routed_scaling_factor_on_output,
     Params params) {
   int tidx = threadIdx.x;
   int64_t thread_row =
@@ -248,6 +249,9 @@ __device__ void moe_fused_gate_impl(
     for (int ii = 0; ii < topk; ++ii) {
       int64_t const idx = topk * thread_row + ii;
       output_ptr[idx] = output_ptr[idx] / output_sum;
+      if (apply_routed_scaling_factor_on_output) {
+        output_ptr[idx] *= routed_scaling_factor;
+      }
     }
   }
 }
@@ -282,7 +286,8 @@ __global__ void moe_fused_gate_kernel(
     int64_t topk_group,
     int64_t topk,
     int64_t num_fused_shared_experts,
-    double routed_scaling_factor) {
+    double routed_scaling_factor,
+    bool apply_routed_scaling_factor_on_output) {
   KernelParams<VPT, NUM_EXPERTS, THREADS_PER_ROW, ROWS_PER_WARP, ROWS_PER_CTA, WARPS_PER_CTA> params;
   moe_fused_gate_impl<T>(
       input,
@@ -294,6 +299,7 @@ __global__ void moe_fused_gate_kernel(
       topk,
       num_fused_shared_experts,
       routed_scaling_factor,
+      apply_routed_scaling_factor_on_output,
       params);
 }

@@ -314,7 +320,8 @@ __global__ void moe_fused_gate_kernel(
       topk_group,                              \
       topk,                                    \
       num_fused_shared_experts,                \
-      routed_scaling_factor);                  \
+      routed_scaling_factor,                   \
+      apply_routed_scaling_factor_on_output);  \
   dispatched = true;                           \
 } while (0)

@@ -342,7 +349,8 @@ __global__ void moe_fused_gate_kernel_dynamic(
     int64_t topk_group,
     int64_t topk,
     int64_t num_fused_shared_experts,
-    double routed_scaling_factor) {
+    double routed_scaling_factor,
+    bool apply_routed_scaling_factor_on_output) {
   KernelParamsDynamic params;
   params.NUM_EXPERTS = num_experts;  // e.g, for deepseek v3, this is 256
   params.VPT = num_experts / num_expert_group;  // e.g., for deepseek v3, this is 256 / 8 = 32
@@ -361,6 +369,7 @@ __global__ void moe_fused_gate_kernel_dynamic(
       topk,
       num_fused_shared_experts,
       routed_scaling_factor,
+      apply_routed_scaling_factor_on_output,
       params);
 }

@@ -374,7 +383,8 @@ std::vector<at::Tensor> moe_fused_gate(
     int64_t topk_group,
     int64_t topk,
     int64_t num_fused_shared_experts,
-    double routed_scaling_factor) {
+    double routed_scaling_factor,
+    bool apply_routed_scaling_factor_on_output) {
   int64_t num_rows = input.size(0);
   int32_t num_experts = input.size(1);
   auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
@@ -473,7 +483,8 @@ std::vector<at::Tensor> moe_fused_gate(
         topk_group,
         topk,
         num_fused_shared_experts,
-        routed_scaling_factor);
+        routed_scaling_factor,
+        apply_routed_scaling_factor_on_output);
   } else if (input.scalar_type() == at::kHalf) {
     moe_fused_gate_kernel_dynamic<float16_t><<<num_blocks, block_dim, 0, stream>>>(
         input.data_ptr(),
@@ -486,7 +497,8 @@ std::vector<at::Tensor> moe_fused_gate(
         topk_group,
         topk,
         num_fused_shared_experts,
-        routed_scaling_factor);
+        routed_scaling_factor,
+        apply_routed_scaling_factor_on_output);
   } else if (input.scalar_type() == at::kFloat) {
     moe_fused_gate_kernel_dynamic<float32_t><<<num_blocks, block_dim, 0, stream>>>(
         input.data_ptr(),
@@ -499,7 +511,8 @@ std::vector<at::Tensor> moe_fused_gate(
         topk_group,
         topk,
         num_fused_shared_experts,
-        routed_scaling_factor);
+        routed_scaling_factor,
+        apply_routed_scaling_factor_on_output);
   } else {
     TORCH_CHECK(false, "Unsupported data type for moe_fused_gate");
   }

sgl-kernel/include/sgl_kernel_ops.h

Lines changed: 2 additions & 1 deletion
@@ -244,7 +244,8 @@ std::vector<at::Tensor> moe_fused_gate(
     int64_t topk_group,
     int64_t topk,
     int64_t num_fused_shared_experts,
-    double routed_scaling_factor);
+    double routed_scaling_factor,
+    bool apply_routed_scaling_factor_on_output);

 void fp8_blockwise_scaled_grouped_mm(
     torch::Tensor& output,
