
Commit 03ac462

[FSDP2] Init FSDP2 based checkpointing (#3824)
Signed-off-by: dependabot[bot] <[email protected]>
Co-authored-by: Daniel King <[email protected]>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

Kick-starts support for FSDP2-based checkpointing in Trainer and State:

- State.fsdp_config exposes the same interface for FSDP2Config and FSDPConfig, so either config can be passed to Trainer/State.
- FSDP2Config currently implements only a minimal default set of interface methods/properties needed to keep State functional.
- State supports FSDP2-based checkpointing.
- Checkpoints saved from FSDP1 artifacts can be loaded into FSDP2.
1 parent 44345ae commit 03ac462
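
The sketch below illustrates the unified config interface this commit introduces. It assumes a toy `model` and `train_dataloader` exist and that the Trainer is invoked with its existing `parallelism_config` argument; note that FSDP2 module wrapping itself is still marked TODO in this commit, so this is an interface illustration, not a guaranteed end-to-end FSDP2 training path.

```python
# Sketch only: how the unified FSDP1/FSDP2 config interface could be exercised.
# `model` and `train_dataloader` are hypothetical placeholders.
from composer import Trainer
from composer.utils import FSDP2Config, FSDPConfig, ParallelismConfig

# FSDP1 path (unchanged): pass an FSDPConfig via ParallelismConfig.fsdp.
fsdp1_parallelism = ParallelismConfig(fsdp=FSDPConfig())

# FSDP2 path (new): pass an FSDP2Config via ParallelismConfig.fsdp2;
# State.fsdp_config resolves to whichever one is set.
fsdp2_parallelism = ParallelismConfig(fsdp2=FSDP2Config(reshard_after_forward=True))

trainer = Trainer(
    model=model,                        # hypothetical ComposerModel
    train_dataloader=train_dataloader,  # hypothetical dataloader
    parallelism_config=fsdp2_parallelism,
    max_duration='1ep',
)
```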

9 files changed: +427 −89 lines changed


composer/core/state.py

Lines changed: 30 additions & 2 deletions
```diff
@@ -43,6 +43,7 @@
 from composer.core.time import Time, Timestamp, TimeUnit, ensure_time
 from composer.devices import Device
 from composer.utils import (
+    FSDP2Config,
     FSDPConfig,
     ParallelismConfig,
     ParallelismType,
@@ -196,7 +197,7 @@ def _ensure_backwards_compatible_checkpointing(state_dict: dict[str, Any]):

 def _create_device_mesh(
     device: Device,
-    fsdp_config: Optional[FSDPConfig],
+    fsdp_config: Optional[FSDPConfig | FSDP2Config],
     tp_config: Optional[TPConfig],
 ) -> Optional[DeviceMesh]:
     if version.parse(torch.__version__.split('.dev')[0]) < version.parse('2.3.0'):
@@ -536,7 +537,8 @@ def __init__(

         self.profiler: Optional[Profiler] = None

-        self.fsdp_config = parallelism_config.fsdp if parallelism_config is not None else None
+        self._fsdp_config = parallelism_config.fsdp if parallelism_config is not None else None
+        self._fsdp2_config = parallelism_config.fsdp2 if parallelism_config is not None else None
         self.tp_config = parallelism_config.tp if parallelism_config is not None else None

         self.automicrobatch_fsdp_hook_handles = []
@@ -873,6 +875,27 @@ def evaluators(self):
     def evaluators(self, evaluators: Union[Evaluator, Sequence[Evaluator]]):
         self._evaluators[:] = list(ensure_tuple(evaluators))

+    @property
+    def fsdp_config(self):
+        """Returns the appropriate FSDP configuration to use.
+
+        Prioritizes FSDP2 config if available, otherwise falls back to FSDP1 config.
+        """
+        return self._fsdp2_config if self._fsdp2_config is not None else self._fsdp_config
+
+    # For backward compatibility
+    @fsdp_config.setter
+    def fsdp_config(self, value: FSDPConfig | FSDP2Config):
+        """Sets the FSDP configuration, handling both FSDP1 and FSDP2 configurations."""
+        if isinstance(value, FSDPConfig):
+            self._fsdp_config = value
+            self._fsdp2_config = None
+        elif isinstance(value, FSDP2Config):
+            self._fsdp2_config = value
+            self._fsdp_config = None
+        else:
+            raise TypeError(f'Expected value to be of type FSDPConfig or FSDP2Config, but got {type(value)}.')
+
     @property
     def fsdp_enabled(self):
         """Indicates if FSDP is enabled."""
@@ -1384,6 +1407,11 @@ def load_model_state(
            with reproducibility.seed_context(self.rank_zero_seed):
                from composer.distributed import prepare_fsdp_module

+               # TODO (FSDP2): support calling FSDP2 wrapper depending on the config type
+               assert isinstance(
+                   self.fsdp_config,
+                   FSDPConfig,
+               ), f'prepare_fsdp_module requires FSDPConfig, got: {type(self.fsdp_config)}'
                self.automicrobatch_fsdp_hook_handles, self.fsdp_modules = prepare_fsdp_module(
                    self.model,
                    self.optimizers,
```
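
To make the resolution rules above concrete, here is a standalone mirror of the new `State.fsdp_config` property/setter logic, without constructing a full `State` object (which needs a model, seed, and device). `_DemoState` is a hypothetical stand-in, not Composer's class.

```python
# Mirrors the getter/setter logic from the diff above for illustration.
from typing import Optional, Union

from composer.utils import FSDP2Config, FSDPConfig


class _DemoState:
    def __init__(self, fsdp: Optional[FSDPConfig] = None, fsdp2: Optional[FSDP2Config] = None):
        self._fsdp_config = fsdp
        self._fsdp2_config = fsdp2

    @property
    def fsdp_config(self) -> Optional[Union[FSDPConfig, FSDP2Config]]:
        # FSDP2 config wins if present; otherwise fall back to FSDP1.
        return self._fsdp2_config if self._fsdp2_config is not None else self._fsdp_config

    @fsdp_config.setter
    def fsdp_config(self, value: Union[FSDPConfig, FSDP2Config]):
        # Setting one flavor clears the other, so only a single config is active.
        if isinstance(value, FSDPConfig):
            self._fsdp_config, self._fsdp2_config = value, None
        elif isinstance(value, FSDP2Config):
            self._fsdp2_config, self._fsdp_config = value, None
        else:
            raise TypeError(f'Expected FSDPConfig or FSDP2Config, got {type(value)}.')


state = _DemoState(fsdp=FSDPConfig())
assert isinstance(state.fsdp_config, FSDPConfig)
state.fsdp_config = FSDP2Config()      # switches the active flavor
assert isinstance(state.fsdp_config, FSDP2Config)
assert state._fsdp_config is None      # the FSDP1 slot was cleared
```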

composer/distributed/fsdp2.py

Lines changed: 7 additions & 38 deletions
```diff
@@ -3,39 +3,10 @@


 """Helpers for FSDP2."""

-import warnings
-from dataclasses import dataclass
-from typing import Optional, Union
-
-from torch import nn
-from torch.distributed._tensor.device_mesh import DeviceMesh
+import torch.nn as nn
 from torch.distributed.fsdp._fully_shard import fully_shard
-from torch.distributed.fsdp._fully_shard._fsdp_api import MixedPrecisionPolicy, OffloadPolicy
-
-
-@dataclass
-class FSDP2Config:
-    """Configuration for Fully Sharded Data Parallelism (FSDP2).
-
-    Args:
-        device_mesh (Optional[DeviceMesh]): The DeviceMesh for sharding. If None, a default 1D mesh is created.
-            For 1D mesh, parameters are fully sharded across the mesh (FSDP).
-            For 2D mesh, parameters are sharded across the 1st dimension and replicated across the 0th dimension (HSDP).
-        reshard_after_forward (Union[bool, int]): Controls parameter behavior after forward:
-            - If True, reshards parameters after forward, re-all-gathers in backward.
-            - If False, keeps unsharded parameters in memory, avoids all-gather in backward.
-            - If int, reshards to smaller world size after forward.
-            Default: True
-        mp_policy (Optional[MixedPrecisionPolicy]): Mixed precision policy. Default: None
-        offload_policy (Optional[OffloadPolicy]): Offloading policy. Default: None
-    """
-    device_mesh: Optional[DeviceMesh] = None
-    reshard_after_forward: Union[bool, int] = True
-    mp_policy: Optional[MixedPrecisionPolicy] = None
-    offload_policy: Optional[OffloadPolicy] = None
-
-    def __post_init__(self):
-        warnings.warn('FSDP2 Config/APIs are experimental and subject to heavy changes', UserWarning)
+from composer.utils.parallelism import FSDP2Config


 def get_standalone_and_tied_modules(modules: list[nn.Module]) -> tuple[list[nn.Module], set[nn.Module]]:
@@ -127,6 +98,8 @@ def apply_fully_shard(
 ) -> None:
     """Applies FSDP2's `fully_shard` to the specified modules and then to the parent model.

+    NOTE: FSDP is only applied to nn.Parameters, not Buffers.
+
     Args:
         model (torch.nn.Module): The parent model.
         independent_submodules (list[torch.nn.Module]): The modules to apply fully_shard to.
@@ -136,22 +109,18 @@ def apply_fully_shard(
         None
     """
     fully_shard_kwargs = {'mesh': fsdp2_config.device_mesh, 'reshard_after_forward': fsdp2_config.reshard_after_forward}
-    if fsdp2_config.mp_policy:
-        fully_shard_kwargs['mp_policy'] = fsdp2_config.mp_policy
-    if fsdp2_config.offload_policy:
-        fully_shard_kwargs['offload_policy'] = fsdp2_config.offload_policy

     # Apply fully_shard to each module in the list
     if len(independent_submodules) == 0:
         raise RuntimeError(
-            "Can't find any submodules to apply FSDP, e.g., the submodules may all have tied weights. Applying FSDP to the root model does not provide any memory savings.",
+            "Can't find any submodules to apply FSDP, e.g., the submodules may all have tied parameters. Applying FSDP to the root model does not provide any memory savings.",
         )

     independent_submodules, modules_tied = get_standalone_and_tied_modules(independent_submodules)
     if len(modules_tied) > 0:
         raise RuntimeError(
-            'Submodules to be sharded have tied weights. FSDP cannot be applied to modules with tied weights independently. '
-            'Please ensure that the submodules do not have tied weights.',
+            'Submodules to be sharded have tied parameters. FSDP cannot be applied to modules with tied parameters independently. '
+            'Please ensure that the submodules do not have tied parameters.',
         )

     # NOTE there is a bug fully_shard can not handle when the model has a child module which is the child of another
```
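
The tied-parameter checks above exist because two submodules that share a parameter object cannot each be wrapped by `fully_shard` independently. The following sketch is not the library implementation; it approximates the contract of `get_standalone_and_tied_modules` (split modules into standalone vs. tied) by comparing parameter identities, to show why weight tying trips the check.

```python
# Illustrative reimplementation, assuming the real helper has a similar contract.
import torch.nn as nn


def split_standalone_and_tied(modules: list[nn.Module]) -> tuple[list[nn.Module], set[nn.Module]]:
    seen: dict[int, nn.Module] = {}   # id(parameter) -> first module seen owning it
    tied: set[nn.Module] = set()
    for module in modules:
        for param in module.parameters(recurse=True):
            owner = seen.get(id(param))
            if owner is not None and owner is not module:
                # The same parameter object appears in two different modules: tied.
                tied.add(module)
                tied.add(owner)
            else:
                seen[id(param)] = module
    standalone = [m for m in modules if m not in tied]
    return standalone, tied


embed = nn.Embedding(100, 16)
head = nn.Linear(16, 100, bias=False)
head.weight = embed.weight  # weight tying, as in many LM output heads

standalone, tied = split_standalone_and_tied([embed, head, nn.Linear(16, 16)])
assert len(tied) == 2        # the tied pair is flagged; sharding them independently would fail
assert len(standalone) == 1  # only the untied Linear can be sharded on its own
```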

composer/trainer/trainer.py

Lines changed: 15 additions & 1 deletion
```diff
@@ -1629,7 +1629,11 @@ def __init__(
                     f'Using closures and precision {self.state.precision} is not supported'
                     f' with FSDP. Please use another optimizer or precision type.',
                 )
-            self.state.scaler = ShardedGradScaler()
+            if isinstance(self.state.fsdp_config, FSDPConfig):
+                # Per TorchTitan doc and FSDP2 test: test_fsdp2_gradscaler.py,
+                # GradScaler can already handle state synchronization via torch._amp_foreach_non_finite_check_and_unscale_,
+                # so we don't need to use ShardedGradScaler
+                self.state.scaler = ShardedGradScaler()

         # suppressing FSDP warning when auto grad accum exits the forward pass before completing
         warnings.filterwarnings(action='ignore', message='Forward order differs from that of the first iteration')
@@ -1657,6 +1661,11 @@ def __init__(
         if self.state.fsdp_config is not None and self.state.fsdp_config.auto_wrap and not self.state.load_monolith_rank0_only:
             # Init with globally fixed seed so all HSDP replicas have the same initial weights
             with reproducibility.seed_context(self.state.rank_zero_seed):
+                # TODO (FSDP2): support calling FSDP2 wrapper depending on the config type
+                assert isinstance(
+                    self.state.fsdp_config,
+                    FSDPConfig,
+                ), f'prepare_fsdp_module requires FSDPConfig, got: {type(self.state.fsdp_config)}'
                 self.state.automicrobatch_fsdp_hook_handles, self.state.fsdp_modules = prepare_fsdp_module(
                     model,
                     optimizers,
@@ -1790,6 +1799,11 @@ def __init__(
                 not self.state.fsdp_enabled and self.state.fsdp_config is not None and self.state.fsdp_config.auto_wrap and
                 self.state.load_monolith_rank0_only
             ):
+                # TODO (FSDP2): support calling FSDP2 wrapper depending on the config type
+                assert isinstance(
+                    self.state.fsdp_config,
+                    FSDPConfig,
+                ), f'prepare_fsdp_module requires FSDPConfig, got: {type(self.state.fsdp_config)}'
                 # Init with globally fixed seed so all HSDP replicas have the same initial weights
                 with reproducibility.seed_context(self.state.rank_zero_seed):
                     self.state.automicrobatch_fsdp_hook_handles, self.state.fsdp_modules = prepare_fsdp_module(
```
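
The first hunk above narrows the gradient-scaler choice: FSDP1 still gets `ShardedGradScaler`, while for FSDP2 the stock `GradScaler` is considered sufficient because its non-finite check already synchronizes across ranks. Below is a minimal sketch of that selection logic; `pick_grad_scaler` is a hypothetical helper, not a Composer API.

```python
# Sketch of the scaler-selection rule encoded by the Trainer change above.
from typing import Optional, Union

from torch.cuda.amp.grad_scaler import GradScaler
from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler

from composer.utils import FSDP2Config, FSDPConfig


def pick_grad_scaler(fsdp_config: Optional[Union[FSDPConfig, FSDP2Config]]) -> GradScaler:
    if isinstance(fsdp_config, FSDPConfig):
        # FSDP1 shards gradients in a way the stock scaler cannot unscale correctly.
        return ShardedGradScaler()
    # FSDP2 (or no FSDP): the plain GradScaler is assumed to be enough.
    return GradScaler()


assert type(pick_grad_scaler(FSDPConfig())) is ShardedGradScaler
assert type(pick_grad_scaler(FSDP2Config())) is GradScaler
```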

composer/utils/__init__.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -75,7 +75,7 @@
     UCObjectStore,
     build_remote_backend,
 )
-from composer.utils.parallelism import FSDPConfig, ParallelismConfig, TPConfig
+from composer.utils.parallelism import FSDP2Config, FSDPConfig, ParallelismConfig, TPConfig
 from composer.utils.remote_uploader import RemoteFilesExistingCheckStatus, RemoteUploader
 from composer.utils.retrying import retry
 from composer.utils.string_enum import StringEnum
@@ -152,6 +152,7 @@
     'STR_TO_DTYPE',
     'ParallelismType',
     'FSDPConfig',
+    'FSDP2Config',
     'TPConfig',
     'ParallelismConfig',
     'MLFLOW_EXPERIMENT_ID_FORMAT_KEY',
```
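
With this re-export, user code can import FSDP2Config from the same place as the other parallelism configs:

```python
# FSDP2Config now sits next to the existing parallelism configs.
from composer.utils import FSDP2Config, FSDPConfig, ParallelismConfig, TPConfig

print(FSDP2Config.__module__)  # composer.utils.parallelism
```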

composer/utils/parallelism.py

Lines changed: 68 additions & 0 deletions
```diff
@@ -3,6 +3,7 @@


 """Parallelism configs."""

+import warnings
 from dataclasses import dataclass, field
 from typing import Any, Optional

@@ -61,6 +62,72 @@ def device_mesh(self, value: Optional[DeviceMesh]):
         self._device_mesh = value


+@dataclass
+class FSDP2Config:
+    """Configuration for Fully Sharded Data Parallelism (FSDP2).
+
+    Args:
+        device_mesh (Optional[DeviceMesh]): The DeviceMesh for sharding. If None, a default 1D mesh is created.
+            For 1D mesh, parameters are fully sharded across the mesh (FSDP).
+            For 2D mesh, parameters are sharded across the 1st dimension and replicated across the 0th dimension (HSDP).
+        reshard_after_forward (Union[bool, int]): Controls parameter behavior after forward.
+    """
+
+    # Settable core FSDP2 attrs
+    device_mesh: Optional[DeviceMesh] = None
+    reshard_after_forward: bool | int = True
+
+    ### Temporary read-only properties for FSDP 1 compatibility ###
+    # to be supported in FSDP2
+    @property
+    def auto_wrap(self) -> bool:
+        return False
+
+    @property
+    def load_monolith_rank0_only(self) -> bool:
+        return False
+
+    @property
+    def sync_module_states(self) -> bool:
+        return False
+
+    @property
+    def load_planner(self) -> Optional[Any]:
+        return None
+
+    @property
+    def save_planner(self) -> Optional[Any]:
+        return None
+
+    @property
+    def sharded_ckpt_prefix_dir(self) -> str:
+        return 'ep{epoch}-ba{batch}'
+
+    @property
+    def activation_cpu_offload(self) -> bool:
+        return False
+
+    @property
+    def data_parallel_shard_degree(self) -> int:
+        return -1
+
+    @property
+    def data_parallel_replicate_degree(self) -> Optional[int]:
+        return None
+
+    # to be deprecated in FSDP2
+    @property
+    def state_dict_type(self) -> str:
+        return 'sharded'
+
+    @property
+    def use_orig_params(self) -> bool:
+        return True
+
+    def __post_init__(self):
+        warnings.warn('FSDP2 Config/APIs are experimental and subject to heavy changes', UserWarning)
+
+
 @dataclass
 class TPConfig:
     """Configuration for tensor parallelism (TP)."""
@@ -74,3 +141,4 @@ class ParallelismConfig:
     """Configuration for parallelism."""
     fsdp: Optional[FSDPConfig] = None
     tp: Optional[TPConfig] = None
+    fsdp2: Optional[FSDP2Config] = None
```
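
A short sketch of how the new dataclass behaves as defined in the diff above: construction emits the experimental-API warning, the FSDP1-compatibility shims are read-only defaults rather than settable fields, and `ParallelismConfig` gains a dedicated `fsdp2` slot.

```python
# Exercises only behavior visible in the diff above.
import warnings

from composer.utils import FSDP2Config, ParallelismConfig

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    cfg = FSDP2Config(reshard_after_forward=False)
assert any('experimental' in str(w.message) for w in caught)

# Read-only shims keep FSDP1-era attribute accesses in State/Trainer working.
assert cfg.auto_wrap is False
assert cfg.state_dict_type == 'sharded'
assert cfg.use_orig_params is True

# The new ParallelismConfig slot carries the FSDP2 config to Trainer/State.
parallelism = ParallelismConfig(fsdp2=cfg)
assert parallelism.fsdp is None and parallelism.fsdp2 is cfg
```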

tests/trainer/fsdp2_context.py

Lines changed: 27 additions & 0 deletions
```diff
@@ -0,0 +1,27 @@
+# Copyright 2024 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Callable, Optional
+
+import pytest
+import torch
+from packaging import version
+
+SKIP_TEST = version.parse(torch.__version__) < version.parse('2.6.0')
+if not SKIP_TEST:
+    # TODO (FSDP2) move this to top once we deprecate torch 2.5
+    from composer.distributed import fsdp2
+    prepare_fully_shard = fsdp2.prepare_fully_shard
+    legalize_param_sharing_between_modules = fsdp2.legalize_param_sharing_between_modules
+    get_standalone_and_tied_modules = fsdp2.get_standalone_and_tied_modules
+else:
+    prepare_fully_shard = lambda *args, **kwargs: None
+    legalize_param_sharing_between_modules = lambda *args, **kwargs: None
+    get_standalone_and_tied_modules = lambda *args, **kwargs: ([], set())
+
+
+def fsdp2_context(func: Callable) -> Optional[Callable]:
+    """Decorator to run FSDP2 tests only on torch 2.6+ and suppress the experimental FSDP2 API warning."""
+    func = pytest.mark.skipif(SKIP_TEST, reason='Skipping test for torch version < 2.6.0')(func)
+    func = pytest.mark.filterwarnings('ignore:FSDP2 Config/APIs are experimental*:UserWarning')(func)
+    return func
```
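
A hypothetical test showing how this helper might be used from within the repo's test suite: the decorator skips the test on torch < 2.6 (where the imports above fall back to no-op stubs) and silences the experimental-API warning. The exact assertions assume `get_standalone_and_tied_modules` returns (standalone modules, tied modules) as its type hints indicate.

```python
# Hypothetical test body; imports assume it lives next to tests/trainer/fsdp2_context.py.
import torch.nn as nn

from tests.trainer.fsdp2_context import fsdp2_context, get_standalone_and_tied_modules


@fsdp2_context
def test_no_tied_modules():
    modules = [nn.Linear(4, 4), nn.Linear(4, 4)]  # no shared parameters
    standalone, tied = get_standalone_and_tied_modules(modules)
    assert len(standalone) == 2
    assert len(tied) == 0
```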
