
Commit a923abe

vcherepanov-nv, Selvaraj Anandaraj, pre-commit-ci[bot], and pggPL authored and committed
Temporarily remove comm_gemm tests (NVIDIA#2133)
This squashed commit also carries the following changes:

* [PyTorch] Disable determinism for sm100 (NVIDIA#2130): disable determinism for sm100+ with cuDNN < 9.14; remove sm100 from the determinism table. (Charlene Yang)
* [PyTorch] ONNX export of FP8 current scaling (NVIDIA#2068): compute amax in the normalization forward pass for current scaling in untuned kernels; follow-up fixes. (Jan Bielak, Pawel Gadzinski)
* [PyTorch][MoE] Tentative fix for replacing from_blob with empty for experts receiving zero tokens (NVIDIA#2134): use torch.empty instead of from_blob for empty shapes. (zhongboz)
* build: pull cached wheels (NVIDIA#2127); update setup.py. (oliver könig)
* feat: add support for multiple quantization modes in the UB communicators (NVIDIA#2043).
* [Common] Add checks to CUDA kernel launches and CUDA API calls (NVIDIA#2074): remove exceptions from destructors; fix a weird dispatch in LayerNorm/RMSNorm. (Xin Yao, Tim Moon)
* [PyTorch] Support bf16+fp8 CUDA graphs (NVIDIA#2098). (Robin Zhang)
* Dropout with 8-bit RNG (NVIDIA#2014): add a dropout kernel with 8-bit RNG; do not enforce that the dropout probability is representable in 8 bits; fix a small statistical bug from using less-equal instead of less-than; refactor kernel implementations and add comments; interpret masks as bytes rather than 16-bit uints. (Tim Moon, Vasudevan Rengasamy)
* Create GPU reload buffers on the main stream (NVIDIA#2131). (Selvaraj Anandaraj)
* MXFP8 unfused quantization support, refined unit tests, removed unnecessary quantization code, plus follow-up fixes. (Varun Thumbe)
* Add cuBLASMp-backed GEMM-like API to TE common (NVIDIA#1824): AG-GEMM, GEMM-RS and GEMM-AR paths; FP8 support; NVTX markers on API functions; replace libcal with NCCL; remove the MPI dependency; abstract the cuBLASMp algo behind our own enum; NVTE_WITH_CUBLASMP off by default. (Vladimir Cherepanov, Przemyslaw Tredak)
* FP8 AllGather in FP8 GroupedGEMM + fix stream usage issue (NVIDIA#2086): support current-scaling FP8 quantization with a given amax; FP8 all-gather in forward and BF16 reduce-scatter in backward; the workflow is AR-max -> FP8 quant -> FP8 AG -> FP8 GroupedGEMM (a toy sketch of this flow follows the commit message); add all layout support for Blackwell+; fix the wrong stream used by device-to-device copies in the grouped-GEMM FFI. (Ming Huang)
* [JAX] Delay MeshResource validation until first usage (NVIDIA#2124). (Jeremy Berchtold)
* [JAX] Decouple Recipe and ScalingMode (NVIDIA#1728): expose the global QuantizeConfig instance through a getter; rename UsageType to TensorSource. (Jeremy Berchtold)
* [JAX] Add a dot_1_output sharding constraint and use AXIS_IS_UNSHARDED (NVIDIA#2128). (Phuong Nguyen)
* [JAX] Add amax input to DBiasQuantizePrimitive and FFI (NVIDIA#2118): make sure amax is initialized to zero; fix the sharding rule. (Phuong Nguyen)
* Further relax constraints to cuDNN 9.13 for disabling fused attention for KV caching (NVIDIA#2121). (Kshitij Lakhani)

Signed-off-by: Vladimir Cherepanov <[email protected]>
Signed-off-by: Varun Thumbe <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
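The AR-max -> FP8 quant -> FP8 AG -> FP8 GroupedGEMM workflow listed for NVIDIA#2086 is easiest to read as code. The sketch below only illustrates that communication pattern in plain PyTorch; `quantize_fp8` and the final matmul are toy stand-ins, not Transformer Engine APIs, and the real implementation lives in the JAX grouped-GEMM path of this repository.

```python
# Toy sketch of the AR-max -> FP8 quant -> FP8 AG -> GEMM flow (not a TE API).
import torch
import torch.distributed as dist

FP8_MAX = 448.0  # largest representable value of float8_e4m3fn


def quantize_fp8(x, amax):
    """Toy per-tensor current-scaling quantization using a shared amax."""
    scale = FP8_MAX / torch.clamp(amax, min=1e-12)
    return (x * scale).to(torch.float8_e4m3fn), 1.0 / scale


def fp8_allgather_matmul(x_local, weight, tp_group):
    # 1. AR-max: all-reduce the local amax so every rank quantizes with the same scale.
    amax = x_local.abs().amax().float()
    dist.all_reduce(amax, op=dist.ReduceOp.MAX, group=tp_group)

    # 2. FP8 quant: quantize the local shard with the shared amax.
    x_fp8_local, scale_inv = quantize_fp8(x_local, amax)

    # 3. FP8 AG: all-gather the FP8 shards as raw bytes (half the traffic of BF16).
    payload = x_fp8_local.view(torch.uint8)
    shards = [torch.empty_like(payload) for _ in range(dist.get_world_size(tp_group))]
    dist.all_gather(shards, payload, group=tp_group)
    x_fp8 = torch.cat(shards, dim=0).view(torch.float8_e4m3fn)

    # 4. GEMM: a real implementation calls an FP8 grouped GEMM; here we simply
    #    dequantize and matmul in BF16 to keep the sketch self-contained.
    return (x_fp8.to(torch.bfloat16) * scale_inv) @ weight
```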
1 parent 4c75c2f commit a923abe

File tree: 75 files changed, +1386 −394 lines


docs/api/pytorch.rst

Lines changed: 4 additions & 1 deletion

@@ -49,7 +49,7 @@ pyTorch
 
 .. autoapifunction:: transformer_engine.pytorch.moe_permute
 
-.. autoapifunction:: transformer_engine.pytorch.moe_permute_with_probs
+.. autoapifunction:: transformer_engine.pytorch.moe_permute_with_probs
 
 .. autoapifunction:: transformer_engine.pytorch.moe_unpermute
 
@@ -62,3 +62,6 @@ pyTorch
 .. autoapifunction:: transformer_engine.pytorch.initialize_ub
 
 .. autoapifunction:: transformer_engine.pytorch.destroy_ub
+
+.. autoapiclass:: transformer_engine.pytorch.UserBufferQuantizationMode
+    :members: FP8, NONE

docs/examples/onnx/onnx_export.ipynb

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@
     "\n",
     "<b>Note:</b>\n",
     "\n",
-    "Currently, export to ONNX is supported only for high precision, FP8 delayed scaling and MXFP8.\n",
+    "Currently, export to ONNX is supported only for high precision, FP8 delayed scaling, FP8 current scaling and MXFP8.\n",
     "\n",
     "</div>\n",
     "\n",

examples/pytorch/comm_gemm_overlap/te_layer_with_overlap.py

Lines changed: 3 additions & 1 deletion

@@ -263,7 +263,9 @@ def dist_print(msg, end="\n", group=nccl_world, src=0, debug=False, error=False)
     te.module.base.initialize_ub(
         [batched_size, hidden_size],
         tp_size,
-        use_fp8=opts.fp8,
+        quantization_modes=[
+            UserBufferQuantizationMode.FP8 if opts.fp8 else UserBufferQuantizationMode.NONE
+        ],
         dtype=torch.bfloat16,
         bootstrap_backend=opts.bootstrap_backend,
     )
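For readers tracking the API change above: `initialize_ub` no longer takes `use_fp8` but a list of quantization modes. Below is a minimal sketch of the new call, assuming `transformer_engine.pytorch` exposes `UserBufferQuantizationMode` as documented in docs/api/pytorch.rst; the shape, tensor-parallel size, and backend are illustrative values, and the usual `torch.distributed` initialization is assumed to have happened already.

```python
# Minimal sketch of the new initialize_ub signature (illustrative values).
import torch
import transformer_engine.pytorch as te
from transformer_engine.pytorch import UserBufferQuantizationMode

use_fp8 = True
te.module.base.initialize_ub(
    [8192, 4096],  # [batch size * sequence length, hidden size]
    8,             # tensor-parallel world size
    quantization_modes=[
        UserBufferQuantizationMode.FP8 if use_fp8 else UserBufferQuantizationMode.NONE
    ],
    dtype=torch.bfloat16,
    bootstrap_backend="nccl",
)
```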

tests/cpp/CMakeLists.txt

Lines changed: 0 additions & 1 deletion

@@ -43,6 +43,5 @@ include_directories(${CMAKE_SOURCE_DIR})
 find_package(CUDAToolkit REQUIRED)
 include(${CMAKE_SOURCE_DIR}/../../3rdparty/cudnn-frontend/cmake/cuDNN.cmake)
 
-add_subdirectory(comm_gemm)
 add_subdirectory(operator)
 add_subdirectory(util)

tests/pytorch/distributed/run_layer_with_overlap.py

Lines changed: 65 additions & 12 deletions

@@ -12,6 +12,8 @@
 import warnings
 import pprint
 import yaml
+from contextlib import nullcontext
+from functools import partial
 
 import torch
 import torch.distributed as dist
@@ -35,9 +37,10 @@ def __init__(self, module, num_layers, *args, **kwargs):
         self.num_layers = num_layers
         self.layers = torch.nn.ModuleList([module(*args, **kwargs) for _ in range(num_layers)])
 
-    def forward(self, x):
-        for layer in self.layers:
-            x = layer(x)
+    def forward(self, x, layer_contexts):
+        for layer, context in zip(self.layers, layer_contexts):
+            with context():
+                x = layer(x)
         return x
 
 
@@ -237,12 +240,46 @@ def _parse_args(argv=None, namespace=None):
         default=False,
         help="Print out additional debug information.",
     )
+    parser.add_argument(
+        "--first-last-layers-bf16",
+        action="store_true",
+        default=False,
+        help="Use bf16 for first and last N layers.",
+    )
+    parser.add_argument(
+        "--num-layers-at-start-in-bf16",
+        type=int,
+        default=0,
+        help="Number of layers at the start to run in bf16.",
+    )
+    parser.add_argument(
+        "--num-layers-at-end-in-bf16",
+        type=int,
+        default=0,
+        help="Number of layers at the end to run in bf16.",
+    )
     args = parser.parse_args(argv, namespace)
 
     if args.use_cuda_graphs and args.layer_type in [te.MultiheadAttention, te.TransformerLayer]:
         warnings.warn(f"{args.layer_type.__name__} does not support CUDA Graphs!")
         args.use_cuda_graphs = False
 
+    if not args.first_last_layers_bf16 and (
+        args.num_layers_at_start_in_bf16 > 0 or args.num_layers_at_end_in_bf16 > 0
+    ):
+        warnings.warn(
+            "num-layers-at-start-in-bf16 and num-layers-at-end-in-bf16 are only supported when"
+            " first-last-layers-bf16 is enabled!"
+        )
+        args.num_layers_at_start_in_bf16 = 0
+        args.num_layers_at_end_in_bf16 = 0
+
+    if args.num_layers_at_start_in_bf16 + args.num_layers_at_end_in_bf16 > args.num_layers:
+        raise ValueError(
+            "num-layers-at-start-in-bf16 + num-layers-at-end-in-bf16 must be less than or equal to"
+            " num-layers!"
+        )
+
     return args
 
 
@@ -381,10 +418,17 @@ def dist_print(msg, src=None, end="\n", debug=False, error=False):
         "qkv_dgrad": {"method": "ring_exchange"},
         "fc1_dgrad": {"method": "ring_exchange"},
     }
+
+    quantization_modes = [
+        UserBufferQuantizationMode.FP8 if opts.fp8 else UserBufferQuantizationMode.NONE
+    ]
+    if opts.first_last_layers_bf16 and opts.fp8:
+        quantization_modes.append(UserBufferQuantizationMode.NONE)
+
     te.module.base.initialize_ub(
         [opts.seq_length * opts.batch_size, opts.num_heads * opts.head_dim],
         opts.tp,
-        use_fp8=opts.fp8,
+        quantization_modes=quantization_modes,
         dtype=torch.bfloat16,
         bootstrap_backend=opts.bootstrap_backend,
         ub_cfgs=ub_cfgs if opts.ub_cfg is None else opts.ub_cfg,
@@ -423,6 +467,16 @@ def dist_print(msg, src=None, end="\n", debug=False, error=False):
     elif opts.quantization == "mxfp8":
        fp8_recipe = MXFP8BlockScaling()
 
+    layer_contexts = [
+        (
+            partial(te.fp8_autocast, enabled=opts.fp8, fp8_recipe=fp8_recipe, fp8_group=nccl_world)
+            if opts.num_layers_at_start_in_bf16 <= i
+            and i < (opts.num_layers - opts.num_layers_at_end_in_bf16)
+            else nullcontext
+        )
+        for i in range(opts.num_layers)
+    ]
+
     # Prepare random input tensors
     test_x = torch.randn(input_shape, dtype=torch.float32, device="cuda", requires_grad=True)
     test_x.retain_grad()
@@ -435,14 +489,13 @@ def dist_print(msg, src=None, end="\n", debug=False, error=False):
     # Execute fwd/bwd and collect tensors to test
     def run_fwd_bwd(model, x):
         with torch.amp.autocast("cuda", dtype=torch.bfloat16):
-            with te.fp8_autocast(enabled=opts.fp8, fp8_recipe=fp8_recipe, fp8_group=nccl_world):
-                y = model(x)
-                if isinstance(y, tuple):
-                    out, *_ = y
-                else:
-                    out = y
-                loss = out.sum()
-                loss.backward()
+            y = model(x, layer_contexts)
+            if isinstance(y, tuple):
+                out, *_ = y
+            else:
+                out = y
+            loss = out.sum()
+            loss.backward()
         return out
 
     torch_rng_state = torch.get_rng_state()
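The change above replaces a single global `fp8_autocast` with one context per layer, so that the first and last few layers can stay in bf16 while the middle layers run in FP8. A condensed sketch of that selection logic, using the same `te.fp8_autocast` arguments as the script; the argument names below are stand-ins for the script's own options.

```python
# Sketch: pick an autocast context per layer (bf16 at the ends, FP8 in the middle).
from contextlib import nullcontext
from functools import partial

import transformer_engine.pytorch as te


def make_layer_contexts(num_layers, n_start_bf16, n_end_bf16, fp8_enabled, fp8_recipe, fp8_group):
    return [
        (
            partial(te.fp8_autocast, enabled=fp8_enabled, fp8_recipe=fp8_recipe, fp8_group=fp8_group)
            if n_start_bf16 <= i < (num_layers - n_end_bf16)
            else nullcontext
        )
        for i in range(num_layers)
    ]


# The wrapper's forward() then enters one context per layer:
#   for layer, context in zip(self.layers, layer_contexts):
#       with context():
#           x = layer(x)
```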

tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py

Lines changed: 7 additions & 1 deletion

@@ -506,7 +506,13 @@ def main() -> None:
             model_config.num_heads * model_config.head_dim,
         ],
         torch.distributed.get_world_size(group),
-        use_fp8=model_config.quantization is not None,
+        quantization_modes=[
+            (
+                UserBufferQuantizationMode.FP8
+                if model_config.quantization is not None
+                else UserBufferQuantizationMode.NONE
+            )
+        ],
         dtype=model_config.dtype,
         bootstrap_backend=bootstrap_backend,
         ub_cfgs=userbuffer_configs,

tests/pytorch/test_fusible_ops.py

Lines changed: 40 additions & 16 deletions

@@ -1749,43 +1749,65 @@ def test_constant_scale(
         torch.testing.assert_close(y_test, y_ref, **tols)
         torch.testing.assert_close(dx_test, x_ref.grad, **tols)
 
-    @pytest.mark.parametrize("prob", (0.1, 0.5, 0.75))
+    @pytest.mark.parametrize("prob", (0.0625, 0.5, 0.75))
     @pytest.mark.parametrize("is_training", (True, False))
-    @pytest.mark.parametrize("shape", ((101,), (2, 4, 16)))
+    @pytest.mark.parametrize("quantization", (None, "fp8_current_scaling"))
+    @pytest.mark.parametrize("shape", ((101,), (2, 4, 16), (128, 128)))
     @pytest.mark.parametrize("dtype", _dtypes)
     def test_dropout(
         self,
         *,
         prob: float,
         is_training: bool,
+        quantization: Optional[str],
         shape: Iterable[int],
         dtype: torch.dtype,
         device: torch.device = "cuda",
     ):
 
+        # Skip invalid configurations
+        quantized_input = quantization is not None
+        maybe_skip_quantization(quantization, dims=shape, device=device)
+
         # Random data
-        x_ref = torch.rand(shape, dtype=dtype, device=device) + 0.5
-        x_test = x_ref.clone().requires_grad_()
-        dy_ref = torch.rand(shape, dtype=dtype, device=device) + 0.5
-        dy_test = dy_ref.clone()
+        # Note: Shift values to make sure inputs are non-zero
+        x_ref, x_test = make_reference_and_test_tensors(
+            shape,
+            quantization=quantization,
+            test_dtype=dtype,
+            test_device=device,
+            test_is_quantized=quantized_input,
+        )
+        with torch.no_grad():
+            x_test += 1
+            x_ref.copy_(x_test)
+        dy_ref, dy_test = make_reference_and_test_tensors(
+            shape,
+            test_dtype=dtype,
+            test_device=device,
+            requires_grad=False,
+        )
 
         # Apply dropout
         op = te_ops.Dropout(prob)
         if is_training:
             op.train()
         else:
             op.eval()
-        y = op(x_test)
-        y.backward(dy_test)
+        y_test = op(x_test)
+        y_test.backward(dy_test)
 
         # Check values
+        y_test = y_test.to(dtype=torch.float64, device="cpu")
+        dx_test = x_test.grad.to(dtype=torch.float64, device="cpu")
         if is_training:
-            mask = ((y != 0) / (1 - prob)).to(dtype=dtype)
-            torch.testing.assert_close(y, x_ref * mask)
-            torch.testing.assert_close(x_test.grad, dy_ref * mask)
+            tols = dtype_tols(dtype)
+            mask = ((y_test != 0) / (1 - prob)).to(dtype=dtype)
+            torch.testing.assert_close(y_test, x_ref * mask, **tols)
+            torch.testing.assert_close(dx_test, dy_ref * mask, **tols)
         else:
-            torch.testing.assert_close(y, x_ref, rtol=0, atol=0)
-            torch.testing.assert_close(x_test.grad, dy_ref, rtol=0, atol=0)
+            torch.testing.assert_close(y_test, x_ref, rtol=0, atol=0)
+            torch.testing.assert_close(dx_test, dy_ref, rtol=0, atol=0)
 
         # Hypothesis testing for number of zeros
         # Note: A Bernoulli random variable with probability p has
@@ -1797,9 +1819,11 @@ def test_dropout(
         # p-value is less than 1% and we assume that the dropout
         # distribution is incorrect.
         if is_training:
-            prob_observed = 1 - torch.count_nonzero(y).item() / y.numel()
-            z_score = (prob_observed - prob) / math.sqrt(prob * (1 - prob) / y.numel())
-            assert abs(z_score) < 2.5758, "Number of zeros is outside 99% confidence interval"
+            prob_observed = 1 - torch.count_nonzero(y_test).item() / y_test.numel()
+            z_score = (prob_observed - prob) / math.sqrt(prob * (1 - prob) / y_test.numel())
+            assert (
+                abs(z_score) < 2.5758
+            ), f"Number of zeros is outside 99% confidence interval ({prob=}, {prob_observed=})"
 
 
 class TestFusedOps:
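The assertion in the hunk above is a one-sample z-test on the fraction of zeroed elements: with n elements and dropout probability p, the observed zero fraction p_obs should satisfy |z| = |p_obs - p| / sqrt(p(1 - p) / n) < 2.5758 at the 99% level. A standalone illustration of the same check with plain torch.nn.functional.dropout (values are illustrative, not taken from the test):

```python
# Standalone illustration of the zero-fraction z-test used in test_dropout.
import math
import torch

p = 0.0625
x = torch.rand(128, 128) + 1  # shift so the input itself has no zeros
y = torch.nn.functional.dropout(x, p=p, training=True)

prob_observed = 1 - torch.count_nonzero(y).item() / y.numel()
z_score = (prob_observed - p) / math.sqrt(p * (1 - p) / y.numel())
assert abs(z_score) < 2.5758, f"zero fraction outside the 99% CI ({p=}, {prob_observed=})"
```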

tests/pytorch/test_numerics.py

Lines changed: 13 additions & 4 deletions

@@ -122,13 +122,18 @@
 
 
 def is_fused_attn_available(
-    config: ModelConfig, dtype: torch.dtype, qkv_layout="bshd_bshd_bshd", is_training=True
+    config: ModelConfig,
+    dtype: torch.dtype,
+    qkv_layout="bshd_bshd_bshd",
+    is_training=True,
+    deterministic=False,
 ):
     _, _, fused_attn_backends = get_available_attention_backends(
         config,
         qkv_dtype=dtype,
         qkv_layout=qkv_layout,
         is_training=is_training,
+        deterministic=deterministic,
     )
     return FusedAttnBackend["F16_arbitrary_seqlen"] in fused_attn_backends
 
@@ -839,7 +844,7 @@ def _test_e2e_checkpointing(bs, dtype, config, checkpoint=False, steps=10, path=
 @pytest.mark.parametrize("model", ["126m"])
 def test_gpt_checkpointing(dtype, bs, model):
     config = model_configs[model]
-    if not is_fused_attn_available(config, dtype):
+    if not is_fused_attn_available(config, dtype, deterministic=True):
         pytest.skip("No attention backend available.")
     outputs = _test_e2e_checkpointing(bs, dtype, config, checkpoint=False)
     outputs_checkpoint = _test_e2e_checkpointing(bs, dtype, config, checkpoint=True)
@@ -887,7 +892,9 @@ def _test_e2e_gpt_accuracy(block, bs, dtype, config):
 @pytest.mark.parametrize("parallel_attention_mlp", all_boolean)
 def test_gpt_accuracy(dtype, bs, model, parallel_attention_mlp):
     config = model_configs[model]
-    if not is_fused_attn_available(config, dtype, qkv_layout="sb3hd", is_training=False):
+    if not is_fused_attn_available(
+        config, dtype, qkv_layout="sb3hd", is_training=True, deterministic=True
+    ):
         pytest.skip("No attention backend available.")
 
     te_gpt = TransformerLayer(
@@ -1000,7 +1007,9 @@ def _test_mha_accuracy(block, bs, dtype, config, mask_type, te=True):
 @pytest.mark.parametrize("mask_type", mask_types)
 def test_mha_accuracy(dtype, bs, model, mask_type):
     config = model_configs[model]
-    if not is_fused_attn_available(config, dtype, qkv_layout="sb3hd", is_training=False):
+    if not is_fused_attn_available(
+        config, dtype, qkv_layout="sb3hd", is_training=True, deterministic=True
+    ):
         pytest.skip("No attention backend available.")
 
     te_mha = MultiheadAttention(

tests/pytorch/test_onnx_export.py

Lines changed: 13 additions & 5 deletions

@@ -65,6 +65,7 @@
     fp8_recipes.append(recipe.MXFP8BlockScaling())
 if fp8_available:
     fp8_recipes.append(recipe.DelayedScaling())
+    fp8_recipes.append(recipe.Float8CurrentScaling())
 fp8_recipes.append(None)
 
 supported_activations = ["gelu", "relu", "reglu", "geglu", "swiglu"]
@@ -81,11 +82,11 @@
     ],
     outputs=[PyCustomOpDef.dt_uint8],
 )
-def trt_fp8_quantize(t, scale):
+def trt_fp8_quantize(t, scale_inv):
     """FP8 quantization extension for ONNX Runtime."""
     x = torch.from_numpy(t).cuda()
     q = te.tensor.float8_tensor.Float8Quantizer(
-        scale=1 / torch.from_numpy(scale).cuda(),
+        scale=1 / torch.from_numpy(scale_inv).cuda(),
         amax=torch.zeros([1]).cuda(),
         fp8_dtype=tex.DType.kFloat8E4M3,
     )
@@ -101,11 +102,11 @@ def trt_fp8_quantize(t, scale):
     ],
     outputs=[PyCustomOpDef.dt_float],
 )
-def trt_fp8_dequantize(t, scale):
+def trt_fp8_dequantize(t, scale_inv):
     """FP8 dequantization extension for ONNX Runtime."""
     x = torch.from_numpy(t).cuda()
     q = te.tensor.float8_tensor.Float8Quantizer(
-        scale=1 / torch.from_numpy(scale).cuda(),
+        scale=1 / torch.from_numpy(scale_inv).cuda(),
         amax=torch.zeros([1]).cuda(),
         fp8_dtype=tex.DType.kFloat8E4M3,
     )
@@ -593,7 +594,9 @@ def _test_export_layernorm_linear(
         fname,
         inp,
         model,
-        atol=1e-3,
+        # For current scaling we use Float8Quantizer in tests + amax computed by hand,
+        # which has slightly different numerics than Float8CurrentScalingQuantizer.
+        atol=1e-3 if fp8_recipe.__class__ is not recipe.Float8CurrentScaling else 2e-2,
         is_fp8=fp8_recipe is not None,
         te_outputs=te_outputs,
     )
@@ -1150,6 +1153,11 @@ def test_trt_integration(fp8_recipe: recipe.Recipe):
         ffn_hidden_size=128,
         num_attention_heads=4,
     ).eval()
+
+    if type(fp8_recipe) == recipe.Float8CurrentScaling:
+        # TODO(pgadzinski): Attention does not work with TRT for FP8CurrentScaling
+        model = te.LayerNormMLP(128, 128)
+
     inps = (torch.randn([16, 16, 128], device="cuda", requires_grad=False),)
 
     with te.fp8_autocast(enabled=fp8_recipe is not None, fp8_recipe=fp8_recipe):
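Since the recipe list above now includes Float8CurrentScaling, here is a minimal sketch of running one of these modules under that recipe. It only uses names that appear in this diff (te.LayerNormMLP, te.fp8_autocast, recipe.Float8CurrentScaling); the `transformer_engine.common.recipe` import path is assumed, and the actual ONNX/TRT export goes through the test suite's own helpers, which are omitted here.

```python
# Minimal sketch: forward pass of a small TE module under FP8 current scaling.
import torch
import transformer_engine.pytorch as te
from transformer_engine.common import recipe

model = te.LayerNormMLP(128, 128).eval()
inp = torch.randn([16, 16, 128], device="cuda", requires_grad=False)

fp8_recipe = recipe.Float8CurrentScaling()
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
    out = model(inp)
print(out.shape)
```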
