
Commit 45d8d75

HydraQYH authored and lifuhuang committed
[Perf] Use Cooperative Schedule for H100 & H200 & H800 in fp8_blockwise_scaled_grouped_mm (#8722)
1 parent f4b97c3 commit 45d8d75

File tree

1 file changed: +3 −2 lines


sgl-kernel/csrc/moe/fp8_blockwise_moe_kernel.cu

Lines changed: 3 additions & 2 deletions
@@ -485,7 +485,8 @@ void sm90_fp8_blockwise_group_mm_dispatch_shape(
   torch::TensorOptions options_int = torch::TensorOptions().dtype(torch::kInt64).device(a.device());
   torch::Tensor problem_sizes_transpose = torch::empty(num_experts * 3, options_int);
 
-  if (a.size(1) > 128) {
+  if (at::cuda::getCurrentDeviceProperties()->multiProcessorCount == 78 && a.size(1) > 128) {
+    // For H20 with K > 128, use Pingpong Schedule
     run_get_group_gemm_starts<MmaConfig0::LayoutSFA, MmaConfig0::LayoutSFB, MmaConfig0::ScaleConfig>(
         expert_offsets,
         a_ptrs,
@@ -517,7 +518,7 @@ void sm90_fp8_blockwise_group_mm_dispatch_shape(
         expert_offsets,
         workspace);
   } else {
-    // Small K
+    // For H20 with K <= 128, and H100 & H200 & H800, use Cooperative Schedule
     run_get_group_gemm_starts<MmaConfig1::LayoutSFA, MmaConfig1::LayoutSFB, MmaConfig1::ScaleConfig>(
         expert_offsets,
         a_ptrs,
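
For context, here is a minimal standalone sketch (not sgl-kernel's code) of the dispatch heuristic this commit encodes. The 78-SM check identifying H20 and the K > 128 threshold come straight from the diff; the names pick_schedule and Schedule, and the plain CUDA runtime query, are illustrative assumptions.

// Hypothetical sketch of the schedule-selection heuristic in this commit.
// Assumes H20 reports multiProcessorCount == 78 (per the diff); the names
// here are illustrative and not part of sgl-kernel.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

enum class Schedule { Pingpong, Cooperative };

Schedule pick_schedule(int64_t k) {
  cudaDeviceProp prop{};
  cudaGetDeviceProperties(&prop, /*device=*/0);
  const bool is_h20 = (prop.multiProcessorCount == 78);
  // Only H20 with K > 128 keeps the Pingpong schedule; H100/H200/H800
  // (and H20 with K <= 128) take the Cooperative schedule.
  return (is_h20 && k > 128) ? Schedule::Pingpong : Schedule::Cooperative;
}

int main() {
  const int64_t k = 256;  // example reduction dimension, i.e. a.size(1)
  printf("K=%lld -> %s schedule\n", static_cast<long long>(k),
         pick_schedule(k) == Schedule::Pingpong ? "Pingpong" : "Cooperative");
  return 0;
}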
