diff --git a/tests/pytorch/debug/test_api_features.py b/tests/pytorch/debug/test_api_features.py index 974772599a..d28db16477 100644 --- a/tests/pytorch/debug/test_api_features.py +++ b/tests/pytorch/debug/test_api_features.py @@ -268,7 +268,7 @@ def assert_empty(): )[0] expected_underflows = ( - ((tensor_fp8._data == 0).sum() - (tensor == 0).sum()) * 100 / (100 * 100 * 5) + ((tensor_fp8.dequantize() == 0).sum() - (tensor == 0).sum()) * 100 / (100 * 100 * 5) ) assert debug_api.transformer_engine.inspect_tensor_enabled( @@ -302,7 +302,7 @@ def assert_empty(): )[0] # Second config in same yaml - tensor = torch.rand((100, 100, 5)) + tensor = torch.rand((100, 100, 5)).cuda() debug_api.transformer_engine.inspect_tensor( "decoder.6.mlp.fc1", tensor_name="activation", @@ -316,7 +316,9 @@ def assert_empty(): stats = log() stats_names = [x[3] for x in stats.keys()] all(s in stats_names for s in ["cur_amax", "dynamic_range", "mean", "std", "l1_norm"]) - assert stats[("decoder.6.mlp.fc1", "activation", "mean", 200)] == tensor.mean() + torch.testing.assert_close( + stats[("decoder.6.mlp.fc1", "activation", "mean", 200)], tensor.mean() + ) debug_api.transformer_engine.inspect_tensor( "decoder.7.mlp.fc1", @@ -331,7 +333,7 @@ def assert_empty(): stats = log() stats_names = [x[3] for x in stats.keys()] all(s in stats_names for s in ["mean", "std", "l1_norm", "min", "max"]) - assert stats[("decoder.7.mlp.fc1", "weight", "max", 200)] == tensor.max() + torch.testing.assert_close(stats[("decoder.7.mlp.fc1", "weight", "max", 200)], tensor.max()) assert not debug_api.transformer_engine.inspect_tensor_enabled( "decoder.7.mlp.fc1", tensor_name="weight", iteration=201 @@ -377,7 +379,7 @@ def fp8_tensor(t): return quantizer(t.cuda()) shape = [1024, 1024] - tensors = [torch.randn(shape) for _ in range(2)] + tensors = [torch.randn(shape).cuda() for _ in range(2)] tensors_fp8 = [fp8_tensor(tensors[i]) for i in range(2)] feed(tensors[0], tensors_fp8[0], quantizer) diff --git 
a/tests/pytorch/debug/test_log.py b/tests/pytorch/debug/test_log.py index ca8e10ad69..dcc9861c84 100644 --- a/tests/pytorch/debug/test_log.py +++ b/tests/pytorch/debug/test_log.py @@ -167,8 +167,8 @@ def test_numerics(fp8_recipe, feature_dirs): num_quantizers=3, ) - tensor = torch.zeros(1024, 1024).cuda() - tensor[0, :] = 1000 + tensor = torch.randn(1024, 1024).cuda() + tensor[0, 100:200] = -0.0 quantizer = recipe_state.make_quantizers()[0] quantized_tensor = quantizer(tensor) @@ -191,15 +191,13 @@ def test_numerics(fp8_recipe, feature_dirs): if "underflows%" in line: underflows = float(line.split("value=")[1]) expected = ( - ((dequantized_tensor == 0).sum() - (tensor == 0).sum()) - / dequantized_tensor.numel() - * 100 + ((dequantized_tensor == 0).sum() - (tensor == 0).sum()) / tensor.numel() * 100 ) assert underflows == pytest.approx(expected.cpu(), abs=1e-4) if "mse" in line: mse = float(line.split("value=")[1]) expected = torch.nn.functional.mse_loss(dequantized_tensor, tensor, reduction="mean") - assert mse == pytest.approx(expected.cpu(), abs=1e-6) + assert mse == pytest.approx(expected.cpu(), abs=1e-4) if "overflows%" in line: overflows = float(line.split("value=")[1]) expected = ( diff --git a/transformer_engine/debug/features/utils/stats_computation.py b/transformer_engine/debug/features/utils/stats_computation.py index 3842ab1c56..2fa6985acf 100644 --- a/transformer_engine/debug/features/utils/stats_computation.py +++ b/transformer_engine/debug/features/utils/stats_computation.py @@ -199,6 +199,15 @@ def _get(buffers, stat_name): ), } +FP8_NEGATIVE_ZERO = 128 # represents -0.0 in fp8 + + +def count_nonzero_fp8(fp8_data: torch.Tensor) -> torch.Tensor: + """Count the number of non-zero elements in the fp8 data.""" + fp8_data = fp8_data.view(dtype=torch.uint8) + zero_vals = torch.tensor([0, FP8_NEGATIVE_ZERO], device=fp8_data.device, dtype=torch.uint8) + return fp8_data.numel() - torch.isin(fp8_data, zero_vals).sum() + def add_underflows_stats(recipe_name: 
str, columnwise: bool = False): """Register *both* underflow stats (num and %) for the given recipe.""" @@ -212,22 +221,23 @@ def add_underflows_stats(recipe_name: str, columnwise: bool = False): stats_to_num[stat_pct] = len(stats_to_num) STATS[stat_num] = ( - lambda x, aux_dict: ( + lambda x, aux_dict: x.count_nonzero() + - count_nonzero_fp8( aux_dict[recipe_name].get_data_tensors( rowwise_data=not columnwise, columnwise_data=columnwise ) - == 0 - ).sum() - - (x == 0).sum(), + ), lambda buffers, _sn=stat_num: sum(_get(buffers, _sn)), ) STATS[stat_pct] = ( lambda x, aux_dict: ( - aux_dict[recipe_name].get_data_tensors( - rowwise_data=not columnwise, columnwise_data=columnwise + x.count_nonzero() + - count_nonzero_fp8( + aux_dict[recipe_name].get_data_tensors( + rowwise_data=not columnwise, columnwise_data=columnwise + ) ) - == 0 - ).sum() + ) / aux_dict[recipe_name].numel() * 100, lambda buffers, _sn_num=stat_num: 100