1 file changed: 3 additions, 2 deletions
@@ -485,7 +485,8 @@ void sm90_fp8_blockwise_group_mm_dispatch_shape(
   torch::TensorOptions options_int = torch::TensorOptions().dtype(torch::kInt64).device(a.device());
   torch::Tensor problem_sizes_transpose = torch::empty(num_experts * 3, options_int);
 
-  if (a.size(1) > 128) {
+  if (at::cuda::getCurrentDeviceProperties()->multiProcessorCount == 78 && a.size(1) > 128) {
+    // For H20 with K > 128, use Pingpong Schedule
     run_get_group_gemm_starts<MmaConfig0::LayoutSFA, MmaConfig0::LayoutSFB, MmaConfig0::ScaleConfig>(
       expert_offsets,
       a_ptrs,
@@ -517,7 +518,7 @@ void sm90_fp8_blockwise_group_mm_dispatch_shape(
       expert_offsets,
       workspace);
   } else {
-    // Small K
+    // For H20 with K <= 128, and H100 & H200 & H800, use Cooperative Schedule
     run_get_group_gemm_starts<MmaConfig1::LayoutSFA, MmaConfig1::LayoutSFB, MmaConfig1::ScaleConfig>(
       expert_offsets,
       a_ptrs,
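The gist of the change, as a standalone sketch: select the CUTLASS kernel schedule from the detected GPU (the patch identifies H20 by its SM count of 78, read via the ATen call used in the diff) and the GEMM K dimension. The run_pingpong_path / run_cooperative_path helpers below are hypothetical stand-ins for the MmaConfig0 (Pingpong) and MmaConfig1 (Cooperative) template instantiations in the real kernel.

// Minimal sketch of the dispatch logic introduced above, assuming a
// PyTorch/ATen build. Helper names are placeholders, not the real kernels.
#include <ATen/cuda/CUDAContext.h>
#include <torch/torch.h>
#include <cstdio>

static void run_pingpong_path(const torch::Tensor&)    { std::printf("Pingpong schedule\n"); }
static void run_cooperative_path(const torch::Tensor&) { std::printf("Cooperative schedule\n"); }

void dispatch_group_mm_sketch(const torch::Tensor& a) {
  // H20 exposes 78 SMs, which is how the patch distinguishes it from
  // H100 / H200 / H800 on the same SM90 architecture.
  const int sm_count = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
  const bool is_h20 = (sm_count == 78);
  const int64_t k = a.size(1);  // K dimension of the grouped GEMM

  if (is_h20 && k > 128) {
    run_pingpong_path(a);       // H20 with K > 128
  } else {
    run_cooperative_path(a);    // H20 with K <= 128, and H100/H200/H800
  }
}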