Skip to content

Commit d4325ea

Browse files
committed
Fix cutlass_fused_experts_fp8
1 parent 44a2e5b commit d4325ea

File tree

2 files changed

+18
-7
lines changed
  • python/sglang/srt/layers

2 files changed

+18
-7
lines changed

python/sglang/srt/layers/moe/fused_moe_triton/layer.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
QuantizationConfig,
2222
QuantizeMethodBase,
2323
)
24+
from sglang.srt.layers.quantization.fp8 import Fp8MoEMethod
2425
from sglang.srt.layers.quantization.modelopt_quant import ModelOptNvFp4FusedMoEMethod
2526
from sglang.srt.layers.quantization.unquant import UnquantizedFusedMoEMethod
2627
from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -690,4 +691,11 @@ def make_expert_input_scale_params_mapping(
690691
]
691692

692693
def should_fuse_routed_scaling_factor_in_topk(self):
    """Decide whether the routed scaling factor should be fused into top-k.

    Returns True for the ModelOpt NvFp4 fused-MoE quant method, or for the
    FP8 MoE quant method when it will dispatch to the cutlass fused-experts
    FP8 kernel; False otherwise.
    """
    quant = self.quant_method
    if isinstance(quant, ModelOptNvFp4FusedMoEMethod):
        return True
    # FP8 path fuses the scaling factor only when the cutlass kernel is used.
    if isinstance(quant, Fp8MoEMethod):
        return quant.should_use_cutlass_fused_experts_fp8()
    return False

python/sglang/srt/layers/quantization/fp8.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -969,6 +969,14 @@ def process_weights_hip_scale_padding(self, layer: Module):
969969
)
970970
torch.cuda.empty_cache()
971971

972+
def should_use_cutlass_fused_experts_fp8(self):
    """Report whether the cutlass fused-experts FP8 kernel should be used.

    All of the following must hold: the SGLANG_CUTLASS_MOE env var is set,
    cutlass FP8 is supported on this build, block quantization is enabled,
    and the GPU is SM100-capable. Checks are ordered cheapest-first and
    short-circuit, matching the original `and` chain.
    """
    if not get_bool_env_var("SGLANG_CUTLASS_MOE"):
        return False
    if not self.cutlass_fp8_supported:
        return False
    if not self.block_quant:
        return False
    return is_sm100_supported()
979+
972980
def apply(
973981
self,
974982
layer: torch.nn.Module,
@@ -1019,12 +1027,7 @@ def apply(
10191027
if ret is not None:
10201028
return ret
10211029

1022-
if (
1023-
get_bool_env_var("SGLANG_CUTLASS_MOE")
1024-
and self.cutlass_fp8_supported
1025-
and self.block_quant
1026-
and is_sm100_supported()
1027-
):
1030+
if self.should_use_cutlass_fused_experts_fp8():
10281031
from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8
10291032

10301033
topk_weights, topk_ids, _ = topk_output

0 commit comments

Comments (0)