@@ -42,7 +42,7 @@ def apply_low_precision_layernorm(model,
    if version.parse(torch.__version__) < version.parse('1.13') and precision == Precision.AMP_BF16:
        check_if_apex_installed()
    policy: Dict[Type[torch.nn.Module], module_surgery.ReplacementFunction] = {
-        torch.nn.LayerNorm: to_FusedLayerNorm
+        torch.nn.LayerNorm: _to_FusedLayerNorm
    }

    replaced_instances = module_surgery.replace_module_classes(module=model, optimizers=optimizers, policies=policy)
@@ -88,14 +88,12 @@ def apply(self, event: Event, state: State, logger: Logger) -> Optional[int]:

class LPLayerNorm(torch.nn.LayerNorm):

-    def __init__(self, layer):
-        super().__init__(normalized_shape=layer.normalized_shape,
-                         eps=layer.eps,
-                         elementwise_affine=layer.elementwise_affine)
-
-        with torch.no_grad():
-            self.weight.copy_(layer.weight)
-            self.bias.copy_(layer.bias)
+    def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None):
+        super().__init__(normalized_shape=normalized_shape,
+                         eps=eps,
+                         elementwise_affine=elementwise_affine,
+                         device=device,
+                         dtype=dtype)

    def forward(self, x):
        module_device = x.device
@@ -106,27 +104,38 @@ def forward(self, x):
        return F.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps)


-def _cast_if_autocast_enabled(hidden_states):
-    if not torch.is_autocast_enabled():
-        return hidden_states
-    else:
-        return torch.cuda.amp.autocast_mode._cast(hidden_states, torch.get_autocast_gpu_dtype())
+def _cast_if_autocast_enabled(tensor):
+    if torch.is_autocast_enabled():
+        if tensor.device.type == 'cuda':
+            dtype = torch.get_autocast_gpu_dtype()
+        elif tensor.device.type == 'cpu':
+            dtype = torch.get_autocast_cpu_dtype()
+        else:
+            raise NotImplementedError()
+        return tensor.to(dtype=dtype)
+    return tensor


def check_if_apex_installed():
    if not APEX_INSTALLED:
        raise ImportError(
-            'https://github.com/NVIDIA/apex is not installed. The Low Precision LayerNorm algorithm cannot be applied. The MosaicML Docker Images (https://hub.docker.com/r/mosaicml/pytorch) contain a copy of APEX for easy use.'
+            'https://github.com/NVIDIA/apex is not installed. The Low Precision LayerNorm algorithm cannot be applied on PyTorch <1.13 without Apex. The MosaicML Docker Images (https://hub.docker.com/r/mosaicml/pytorch) contain a copy of APEX for easy use.'
        )


def _to_LPLayerNorm(layer: torch.nn.Module, module_index: int) -> LPLayerNorm:
-    if not isinstance(layer, torch.nn.LayerNorm):
-        raise TypeError(f'Expected torch.nn.LayerNorm, got {type(layer)}')
-    return LPLayerNorm(layer)
+    """Defines a replacement policy from a `torch.nn.LayerNorm` to a `LPLayerNorm`"""
+    assert isinstance(layer,
+                      torch.nn.LayerNorm), 'The replacement policy will look for all instances of torch.nn.LayerNorm'
+    lp_layernorm = LPLayerNorm(layer.normalized_shape, layer.eps, layer.elementwise_affine, layer.weight.device,
+                               layer.weight.dtype)
+    with torch.no_grad():
+        lp_layernorm.weight.copy_(layer.weight)
+        lp_layernorm.bias.copy_(layer.bias)
+    return lp_layernorm


-def to_FusedLayerNorm(layer: torch.nn.Module, module_index: int) -> APEXFusedLayerNorm:
+def _to_FusedLayerNorm(layer: torch.nn.Module, module_index: int) -> APEXFusedLayerNorm:
    """Defines a replacement policy from a `torch.nn.LayerNorm` to a `apex.normalization.fused_layer_norm`"""
    if not isinstance(layer, torch.nn.LayerNorm):
        raise TypeError(f'Expected torch.nn.LayerNorm, got {type(layer)}')
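For reference, a minimal sketch of how the reworked replacement policy is expected to behave: `_to_LPLayerNorm` builds an `LPLayerNorm` with the same normalized_shape, eps, affine flag, device, and dtype as the original module and copies its parameters, so with autocast disabled both modules should produce identical outputs. The import path below is an assumption about where this file lives in the package, and `_to_LPLayerNorm` is a private helper, so this is illustrative rather than a supported API.

import torch

# Assumed import path for the symbols defined in this file; adjust to the actual module layout.
from composer.algorithms.low_precision_layernorm.low_precision_layernorm import LPLayerNorm, _to_LPLayerNorm

torch.manual_seed(0)

# A LayerNorm with non-default affine parameters, standing in for a layer from a trained model.
original = torch.nn.LayerNorm(64)
with torch.no_grad():
    original.weight.mul_(1.5)
    original.bias.add_(0.1)

# The policy copies normalized_shape, eps, elementwise_affine, device, dtype, and the parameters.
replacement = _to_LPLayerNorm(original, module_index=0)
assert isinstance(replacement, LPLayerNorm)

x = torch.randn(8, 64)
# With autocast disabled, _cast_if_autocast_enabled is a no-op, so the outputs match exactly.
assert torch.equal(original(x), replacement(x))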