File tree Expand file tree Collapse file tree 2 files changed +18
-7
lines changed Expand file tree Collapse file tree 2 files changed +18
-7
lines changed Original file line number Diff line number Diff line change 21
21
QuantizationConfig ,
22
22
QuantizeMethodBase ,
23
23
)
24
+ from sglang .srt .layers .quantization .fp8 import Fp8MoEMethod
24
25
from sglang .srt .layers .quantization .modelopt_quant import ModelOptNvFp4FusedMoEMethod
25
26
from sglang .srt .layers .quantization .unquant import UnquantizedFusedMoEMethod
26
27
from sglang .srt .managers .schedule_batch import global_server_args_dict
@@ -690,4 +691,11 @@ def make_expert_input_scale_params_mapping(
690
691
]
691
692
692
693
def should_fuse_routed_scaling_factor_in_topk (self ):
693
- return isinstance (self .quant_method , ModelOptNvFp4FusedMoEMethod )
694
+ if isinstance (self .quant_method , ModelOptNvFp4FusedMoEMethod ):
695
+ return True
696
+ if (
697
+ isinstance (self .quant_method , Fp8MoEMethod )
698
+ and self .quant_method .should_use_cutlass_fused_experts_fp8 ()
699
+ ):
700
+ return True
701
+ return False
Original file line number Diff line number Diff line change @@ -969,6 +969,14 @@ def process_weights_hip_scale_padding(self, layer: Module):
969
969
)
970
970
torch .cuda .empty_cache ()
971
971
972
+ def should_use_cutlass_fused_experts_fp8 (self ):
973
+ return (
974
+ get_bool_env_var ("SGLANG_CUTLASS_MOE" )
975
+ and self .cutlass_fp8_supported
976
+ and self .block_quant
977
+ and is_sm100_supported ()
978
+ )
979
+
972
980
def apply (
973
981
self ,
974
982
layer : torch .nn .Module ,
@@ -1019,12 +1027,7 @@ def apply(
1019
1027
if ret is not None :
1020
1028
return ret
1021
1029
1022
- if (
1023
- get_bool_env_var ("SGLANG_CUTLASS_MOE" )
1024
- and self .cutlass_fp8_supported
1025
- and self .block_quant
1026
- and is_sm100_supported ()
1027
- ):
1030
+ if self .should_use_cutlass_fused_experts_fp8 ():
1028
1031
from sglang .srt .layers .moe .cutlass_moe import cutlass_fused_experts_fp8
1029
1032
1030
1033
topk_weights , topk_ids , _ = topk_output
You can’t perform that action at this time.
0 commit comments