sgl-project · zhyncs · Jul 28, 2025 · Jul 28, 2025
@@ -800,11 +800,6 @@ def __init__(
             routed_scaling_factor=routed_scaling_factor,
         )
         self.deepep_mode = deepep_mode
-        if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
-            assert self.use_fp8_w8a8, (
-                "DeepGEMM requires an fp8_w8a8 model; "
-                "alternatively, you can disable DeepGEMM by turning off the ENABLE_JIT_DEEPGEMM environment variable."
-            )
 
         # TODO: move to the beginning of the file
         from sglang.srt.distributed.parallel_state import get_tp_group
@@ -897,7 +892,7 @@ def moe_impl(self, dispatch_output: DispatchOutput):
             # in forward_aiter, we skip token permutation and unpermutation, which have been fused inside aiter kernel
             return self.forward_aiter(dispatch_output)
         if dispatch_output.format.is_deepep_normal():
-            if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
+            if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and self.use_fp8_w8a8:
                 return self.forward_deepgemm_contiguous(dispatch_output)
             else:
                 return self.forward_normal(dispatch_output)