[PyTorch] Fix bug in FP8 cast in LayerNormLinear/LayerNormMLP (#738)

timmoon10 · web-flow · commit df1b16dae798 · 2024-03-28T21:49:01.000-07:00
Perform FP8 cast on gathered layernorm output in LayerNormLinear

Signed-off-by: Tim Moon <tmoon@nvidia.com>
diff --git a/transformer_engine/pytorch/cpp_extensions/gemm.py b/transformer_engine/pytorch/cpp_extensions/gemm.py
@@ -44,6 +44,8 @@ def fp8_gemm(
         assert fp8_meta_tensor is not None and out_index is not None
     assert_dim_for_fp8_exec(A)
     assert_dim_for_fp8_exec(B)
+    assert A.dtype == torch.uint8
+    assert B.dtype == torch.uint8
 
     if out is None:
         out = torch.empty(
diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py
@@ -169,12 +169,19 @@ def forward(
                         out=ln_out_fp8)
                     ln_out = ln_out_fp8
                 else:
-                    ln_out = tex.cast_to_fp8(
-                        ln_out,
+                    ln_out_total = tex.cast_to_fp8(
+                        ln_out_total,
                         fp8_meta["scaling_fwd"],
                         tex.FP8FwdTensors.GEMM1_INPUT,
                         fp8_dtype_forward,
                     )
+                    if ln_out_gathered:
+                        rank = torch.distributed.get_rank(tp_group)
+                        slice_start = rank * ln_out.size(0)
+                        slice_end = (rank + 1) * ln_out.size(0)
+                        ln_out = ln_out_total[slice_start:slice_end, ...]
+                    else:
+                        ln_out = ln_out_total
 
         if fp8:
             bias_dtype = (
diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py
@@ -187,12 +187,27 @@ def forward(
         if return_layernorm_output:
             ln_out_return = ln_out_total if return_layernorm_output_gathered else ln_out
             if fp8:
-                ln_out = tex.cast_to_fp8(
-                    ln_out,
-                    fp8_meta["scaling_fwd"],
-                    tex.FP8FwdTensors.GEMM1_INPUT,
-                    fp8_dtype_forward,
-                )
+                if ub_overlap_ag:
+                    ln_out = tex.cast_to_fp8(
+                        ln_out,
+                        fp8_meta["scaling_fwd"],
+                        tex.FP8FwdTensors.GEMM1_INPUT,
+                        fp8_dtype_forward,
+                    )
+                else:
+                    ln_out_total = tex.cast_to_fp8(
+                        ln_out_total,
+                        fp8_meta["scaling_fwd"],
+                        tex.FP8FwdTensors.GEMM1_INPUT,
+                        fp8_dtype_forward,
+                    )
+                    if ln_out_gathered:
+                        rank = torch.distributed.get_rank(tp_group)
+                        slice_start = rank * ln_out.size(0)
+                        slice_end = (rank + 1) * ln_out.size(0)
+                        ln_out = ln_out_total[slice_start:slice_end, ...]
+                    else:
+                        ln_out = ln_out_total
 
         if fp8:
             bias_dtype = (