Commit 570fd2e

Activation Checkpointing and Offloading for FSDP2 (#3832)
1 parent 3c29f1a commit 570fd2e

8 files changed: +504 -10 lines changed

composer/distributed/activation_checkpointing.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
# Copyright 2022 MosaicML Composer authors
# SPDX-License-Identifier: Apache-2.0

"""Helpers for activation checkpointing. Note that while this is orthogonal to FSDP2, it is implemented in the distributed directory because it is closely related to FSDP2."""

from typing import Callable, Optional

import torch
import torch.nn as nn
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    CheckpointImpl,
    apply_activation_checkpointing,
    checkpoint_wrapper,
    offload_wrapper,
)


def generate_default_check_fn(model: nn.Module) -> Callable:
    """Generates the default check fn for activation checkpointing/offloading."""

    def _check_fn(module: torch.nn.Module) -> bool:
        if hasattr(module, '_activation_checkpointing'):
            return bool(module._activation_checkpointing)
        if hasattr(
            model,
            'activation_checkpointing_fn',
        ) and isinstance(model.activation_checkpointing_fn, Callable):
            return model.activation_checkpointing_fn(module)
        return False

    return _check_fn


def apply_ac(
    model: nn.Module,
    activation_checkpointing: bool,
    activation_cpu_offload: bool,
    check_fn: Optional[Callable] = None,
) -> None:
    """Apply activation checkpointing to the model. This is orthogonal to FSDP2 so it can be applied pre-sharding or post-sharding.

    This method follows the same logic as FSDP1 as well as TorchTitan's AC example.

    Args:
        model (nn.Module): The model to apply activation checkpointing to.
        activation_checkpointing (bool): Whether to apply activation checkpointing.
        activation_cpu_offload (bool): Whether to offload activations to the CPU.
        check_fn (Optional[Callable]): An optional function to determine if a module should be checkpointed.
    """
    # Create the base checkpointing wrapper using no_reentrant checkpointing by default as
    # PyTorch notes that reentrant checkpointing is deprecated and will be removed in a future release
    opt_checkpoint_wrapper = lambda m: checkpoint_wrapper(
        m,
        checkpoint_impl=CheckpointImpl.NO_REENTRANT,
    ) if activation_checkpointing else (lambda module: module)
    # Create the combined wrapper which takes cpu offloading into consideration
    opt_combined_wrapper = (
        lambda module: offload_wrapper(
            opt_checkpoint_wrapper(module)
            if activation_checkpointing else module,  # type: ignore reportGeneralTypeIssues
        )
    ) if activation_cpu_offload else opt_checkpoint_wrapper

    # Create the check function to determine if a module should be checkpointed
    if check_fn is None:
        check_fn = generate_default_check_fn(model)

    # Apply the activation checkpointing on the model, this uses _recursive_wrap to apply the wrapper to all submodules
    # but doesn't apply the wrapper to the root module
    apply_activation_checkpointing(model, opt_combined_wrapper, check_fn)  # type: ignore
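
To make the opt-in mechanics above concrete, here is a minimal usage sketch, not taken from the commit, assuming this file lands as composer/distributed/activation_checkpointing.py and a PyTorch build that ships the _checkpoint wrapper utilities. A submodule is flagged with the _activation_checkpointing attribute that generate_default_check_fn looks for, and apply_ac then wraps only that submodule; the toy model is purely illustrative:

# Hypothetical usage sketch; the toy model and flag placement are illustrative only.
import torch
import torch.nn as nn

from composer.distributed.activation_checkpointing import apply_ac

model = nn.Sequential(nn.Linear(16, 32), nn.Linear(32, 16))
# The default check fn returns True for any module carrying this attribute.
model[0]._activation_checkpointing = True  # type: ignore[attr-defined]

# Wrap the flagged submodule with non-reentrant checkpointing; no CPU offload.
apply_ac(model, activation_checkpointing=True, activation_cpu_offload=False)

loss = model(torch.randn(4, 16)).sum()
loss.backward()  # the flagged layer's forward is re-run here instead of caching its activations
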
composer/distributed/prepare_distributed.py

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
# Copyright 2022 MosaicML Composer authors
# SPDX-License-Identifier: Apache-2.0

"""Entrypoint for distributed training (using FSDP2)."""

from typing import Callable, Optional

import torch
from torch.distributed.fsdp.wrap import CustomPolicy

from composer.distributed.activation_checkpointing import apply_ac
from composer.distributed.fsdp2 import prepare_fully_shard
from composer.utils.parallelism import FSDP2Config, FSDPConfig


def parallelize_model(
    model: torch.nn.Module,
    config: FSDP2Config | FSDPConfig,
    optimizer: Optional[torch.optim.Optimizer] = None,
    fsdp_wrap_policy: Optional[CustomPolicy] = None,
    activation_checkpointing_check_fn: Optional[Callable] = None,
):
    """Prepare a model for distributed training.

    Args:
        model (torch.nn.Module): The model to prepare for distributed training.
        config (FSDP2Config | FSDPConfig): The configuration for distributed training. Currently only FSDP2Config is supported.
        optimizer (Optional[torch.optim.Optimizer]): The optimizer to use for distributed training.
        fsdp_wrap_policy (Optional[CustomPolicy]): The FSDP wrap policy to use for distributed training.
        activation_checkpointing_check_fn (Optional[Callable]): The function to use to check if a module's activations should be checkpointed or offloaded.
    """
    if isinstance(config, FSDPConfig):
        raise ValueError('FSDPConfig is not supported for now, use FSDP2Config instead')

    if activation_checkpointing_check_fn is not None:
        if not config.activation_checkpointing and not config.activation_cpu_offload:
            raise ValueError(
                'Activation checkpointing or offloading must be enabled if activation_checkpointing_check_fn is provided',
            )

    if config.activation_checkpointing or config.activation_cpu_offload:
        apply_ac(
            model,
            config.activation_checkpointing,
            config.activation_cpu_offload,
            activation_checkpointing_check_fn,
        )

    prepare_fully_shard(model, optimizer, config, fsdp_wrap_policy)
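
As a rough end-to-end sketch of the new entrypoint (illustrative, not the commit's own example; it assumes torch >= 2.6, an initialized process group such as one launched with torchrun, and that this file lands as composer/distributed/prepare_distributed.py, as the test imports further below suggest), activation checkpointing is driven entirely by the two new FSDP2Config flags and is applied before the model is fully sharded:

# Sketch only: the toy model, dimensions, and optimizer choice are assumptions.
import torch

from composer.distributed.prepare_distributed import parallelize_model
from composer.utils.parallelism import FSDP2Config

model = torch.nn.Sequential(
    torch.nn.Linear(16, 32, device='cuda'),
    torch.nn.Linear(32, 16, device='cuda'),
)
model[0]._activation_checkpointing = True  # opt the first layer in to checkpointing
optimizer = torch.optim.AdamW(model.parameters())

config = FSDP2Config(
    activation_checkpointing=True,  # wrap flagged modules with checkpoint_wrapper
    activation_cpu_offload=False,   # keep checkpointed activations on the GPU
)

# apply_ac runs first (pre-sharding), then prepare_fully_shard shards the model with FSDP2.
parallelize_model(model, config, optimizer=optimizer)
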

composer/utils/parallelism.py

Lines changed: 4 additions & 4 deletions
@@ -76,6 +76,10 @@ class FSDP2Config:
     # Settable core FSDP2 attrs
     device_mesh: Optional[DeviceMesh] = None
     reshard_after_forward: bool | int = True
+    # TODO: If we have reasonable evidence that activation checkpointing/activation offloading is decoupled from FSDP(2)
+    # in most of our use cases, we can decouple these two attributes from the FSDP2Config class.
+    activation_checkpointing: bool = False
+    activation_cpu_offload: bool = False

     ### Temporary read-only properties for FSDP 1 compatibility ###
     # to be supported in FSDP2
@@ -103,10 +107,6 @@ def save_planner(self) -> Optional[Any]:
     def sharded_ckpt_prefix_dir(self) -> str:
         return 'ep{epoch}-ba{batch}'

-    @property
-    def activation_cpu_offload(self) -> bool:
-        return False
-
     @property
     def data_parallel_shard_degree(self) -> int:
         return -1

tests/common/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -18,7 +18,9 @@
 from tests.common.events import EventCounterCallback
 from tests.common.markers import device, world_size
 from tests.common.models import (
+    ComposerCounterModel,
     ConvModel,
+    CountModule,
     EmbeddedWeightTiedModel,
     EmptyModel,
     EvenSimplerMLP,
@@ -75,4 +77,6 @@ def get_module_subclasses(module: types.ModuleType, cls: type) -> list[type]:
     'EvenSimplerMLP',
     'SimpleComposerMLP',
     'TPSimpleComposerMLP',
+    'ComposerCounterModel',
+    'CountModule',
 ]

tests/common/models.py

Lines changed: 34 additions & 0 deletions
@@ -146,6 +146,40 @@ def add_fsdp_wrap_attribute_to_children(self):
             child._fsdp_wrap = True  # type: ignore


+class CountModule(torch.nn.Module):
+
+    def __init__(self, num_inputs: int, num_outputs: int, device: Union[str, torch.device]):
+        super().__init__()
+        self.call_count = 0
+        self.inner_1 = torch.nn.Linear(num_inputs, num_outputs, device=device, bias=False)
+        self.inner_2 = torch.nn.Linear(num_outputs, num_outputs, device=device, bias=False)
+
+    def forward(self, x):
+        self.call_count += 1
+        x = self.inner_1(x)
+        x = self.inner_2(x)
+        return x
+
+
+# A simple MLP with two hidden layers where the module counts the number of times it calls forward
+# This is used to test activation checkpointing
+class ComposerCounterModel(ComposerClassifier):
+
+    def __init__(
+        self,
+        num_inputs: int,
+        num_outputs: int,
+        device: Union[str, torch.device],
+        num_hidden_layer_features: int = 8,
+    ):
+        module = torch.nn.Sequential(
+            CountModule(num_inputs, num_hidden_layer_features, device),
+            CountModule(num_hidden_layer_features, num_outputs, device),
+        )
+        super().__init__(num_classes=num_outputs, module=module)
+        self.module = module
+
+
 # Like SimpleComposerMLP but saves each layer which is necessary to TP to it.
 class TPSimpleComposerMLP(ComposerClassifier):
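
The counter model exists because non-reentrant checkpointing re-runs a wrapped module's forward during the backward pass, so a checkpointed CountModule should see its call_count incremented twice per step: once in the forward pass and once during recomputation. Below is a hedged sketch of how that behavior could be observed, using the helpers from this commit rather than its actual test code, and assuming the tests package is importable:

# Illustrative check, not the commit's test; module paths follow this diff.
import torch

from composer.distributed.activation_checkpointing import apply_ac
from tests.common.models import ComposerCounterModel

model = ComposerCounterModel(num_inputs=8, num_outputs=2, device='cpu')
blocks = list(model.module)  # keep handles to the original CountModules before wrapping

# Flag both CountModules via the attribute the default check fn reads, then wrap them.
for block in blocks:
    block._activation_checkpointing = True  # type: ignore[attr-defined]
apply_ac(model, activation_checkpointing=True, activation_cpu_offload=False)

loss = model.module(torch.randn(4, 8)).sum()
loss.backward()

# Each block ran once in the forward pass and once more during backward recomputation.
print([block.call_count for block in blocks])  # expected: [2, 2]
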
tests/trainer/fsdp2_context.py

Lines changed: 5 additions & 1 deletion
@@ -10,14 +10,18 @@
 SKIP_TEST = version.parse(torch.__version__) < version.parse('2.6.0')
 if not SKIP_TEST:
     # TODO (FSDP2) move this to top once we deprecate torch 2.5
-    from composer.distributed import fsdp2
+    from composer.distributed import activation_checkpointing, fsdp2, prepare_distributed
+    apply_ac = activation_checkpointing.apply_ac
+    parallelize_model = prepare_distributed.parallelize_model
     prepare_fully_shard = fsdp2.prepare_fully_shard
     legalize_param_sharing_between_modules = fsdp2.legalize_param_sharing_between_modules
     get_standalone_and_tied_modules = fsdp2.get_standalone_and_tied_modules
     _recursive_apply_fully_shard = fsdp2._recursive_apply_fully_shard
     _generate_default_policy = fsdp2.generate_default_policy
     check_param_tying = fsdp2.check_param_tying
 else:
+    apply_ac = lambda *args, **kwargs: None
+    parallelize_model = lambda *args, **kwargs: None
     prepare_fully_shard = lambda *args, **kwargs: None
     legalize_param_sharing_between_modules = lambda *args, **kwargs: None
     get_standalone_and_tied_modules = lambda *args, **kwargs: ([], set())
