Commit de43ee2

updated code
1 parent 5b033b7 commit de43ee2

File tree: 2 files changed (+100, -22 lines)

composer/distributed/fsdp2.py
tests/trainer/test_fsdp2.py

composer/distributed/fsdp2.py

Lines changed: 18 additions & 8 deletions
@@ -5,14 +5,15 @@
 
 import warnings
 from dataclasses import dataclass
-from typing import Optional, Union
+from typing import Optional, Union, Sequence
 
+import torch
 from torch import nn
-from torch.optim import Optimizer
 from torch.distributed._tensor.device_mesh import DeviceMesh
 from torch.distributed.fsdp._fully_shard import fully_shard
 from torch.distributed.fsdp._fully_shard._fsdp_api import MixedPrecisionPolicy, OffloadPolicy
 
+from composer.utils import ensure_tuple
 
 @dataclass
 class FSDP2Config:
@@ -121,7 +122,7 @@ def _check_param_sharing(module: nn.Module):
     _check_param_sharing(model)
 
 def update_optimizer_modules(
-    optimizer: Optimizer,
+    optimizers: Optional[Union[torch.optim.Optimizer, Sequence[torch.optim.Optimizer]]],
     modules_to_shard: list[nn.Module],
     model: nn.Module,
     orig_param_id_to_name: dict[int, str],
@@ -135,21 +136,29 @@ def update_optimizer_modules(
         model (nn.Module): The parent model that is also sharded.
         orig_param_id_to_name (dict[int, str]): Mapping from original parameter IDs to their names.
     """
+    # Using the same logic as in FSDP1 to address multiple optimizers
+    optimizers_tuple = ensure_tuple(optimizers)
+    if len(optimizers_tuple) != 1:
+        raise NotImplementedError(f'Only one optimizer is supported; found {len(optimizers_tuple)} optimizers')
+
+    optimizer = optimizers_tuple[0]
+
     # Build a mapping from parameter name to sharded parameter (after sharding)
     name_to_sharded_param = dict(model.named_parameters())
     for module in modules_to_shard:
         name_to_sharded_param.update(dict(module.named_parameters()))
 
     # Create a mapping from old parameters to new DTensor parameters
+    # Note: if params are tied and the same parameter is in multiple groups, pytorch will raise an error
     old_to_new_param = {}
     for group in optimizer.param_groups:
         for param in group['params']:
             param_name = orig_param_id_to_name.get(id(param))
+            # Note: the names of the parameters stay the same after sharding so we can do the following.
             if param_name is not None and param_name in name_to_sharded_param:
                 old_to_new_param[param] = name_to_sharded_param[param_name]
             else:
-                # TODO: Look into whether we will ever hit this case...
-                raise ValueError(f"Parameter {param} not found in model")
+                raise ValueError(f"The same model must be passed to the optimizer and trainer.")
 
     # Update param groups with new parameters
     new_param_groups = []
@@ -223,7 +232,7 @@ def apply_fully_shard(
 
 def prepare_fully_shard(
     model: nn.Module,
-    optimizer: Optional[Optimizer],
+    optimizers: Optional[Union[torch.optim.Optimizer, Sequence[torch.optim.Optimizer]]],
    fsdp2_config: FSDP2Config,
 ) -> None:
     """Applies FSDP2's `fully_shard` to the model according to given fsdp2_config.
@@ -244,5 +253,6 @@ def prepare_fully_shard(
 
     apply_fully_shard(model, modules_to_shard, fsdp2_config)
 
-    # After the model is sharded in place, we can update the optimizer state to use the DTensor parameters
-    update_optimizer_modules(optimizer, modules_to_shard, model, orig_param_id_to_name)
+    # If the optimizer is provided, update the optimizer state to use the DTensor parameters
+    if optimizers is not None:
+        update_optimizer_modules(optimizers, modules_to_shard, model, orig_param_id_to_name)
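For context, the core of update_optimizer_modules is a name-based remapping of the optimizer's param groups onto the model's new (sharded) parameter objects. Below is a minimal standalone sketch of that idea, not part of this commit: a plain parameter clone stands in for the actual DTensor sharding so it runs without a distributed setup, and the variable names are illustrative.

import torch
from torch import nn

model = nn.Linear(4, 4)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

# Record which name each original parameter object had (mirrors orig_param_id_to_name).
orig_param_id_to_name = {id(p): name for name, p in model.named_parameters()}

# Stand-in for sharding: replace every parameter with a new Parameter object.
for name, p in list(model.named_parameters()):
    setattr(model, name, nn.Parameter(p.detach().clone()))

# Re-point each param group at the new parameters by matching on the recorded names
# (mirrors name_to_sharded_param / old_to_new_param in the commit).
name_to_new_param = dict(model.named_parameters())
for group in optimizer.param_groups:
    group['params'] = [name_to_new_param[orig_param_id_to_name[id(p)]] for p in group['params']]

# The optimizer now holds the model's current parameter objects.
current_ids = {id(p) for p in model.parameters()}
assert all(id(p) in current_ids for g in optimizer.param_groups for p in g['params'])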

tests/trainer/test_fsdp2.py

Lines changed: 82 additions & 14 deletions
@@ -5,6 +5,7 @@
 import torch
 from packaging import version
 from torch.utils.data import DataLoader
+
 from composer.models import ComposerClassifier
 from composer.trainer.trainer import Trainer
 from composer.utils import dist
@@ -27,7 +28,6 @@
 
 @pytest.mark.parametrize('model', [SimpleWeightTiedModel, PartialWeightTiedModel])
 @pytest.mark.parametrize('device', _INIT_DEVICES)
-@pytest.mark.parametrize('optimizer', [torch.optim.Adam, torch.optim.SGD])
 @world_size(2)
 @pytest.mark.gpu
 @pytest.mark.filterwarnings('ignore:FSDP2 Config/APIs are experimental*:UserWarning')
@@ -36,7 +36,6 @@ def test_fsdp2_initialization_with_tied_params(
     model: ComposerClassifier,
     world_size: int,
     device: str,
-    optimizer: type[torch.optim.Optimizer],
 ):
     """test FSDP2 initialization for a simple model with weight tying and a model where two modules
     from separate submodules have weight tying applied.
@@ -53,26 +52,16 @@ def test_fsdp2_initialization_with_tied_params(
         mp_policy=None,
         offload_policy=None,
     )
-    optimizer = optimizer(model.parameters(), lr=0.1)
-    prepare_fully_shard(model=model.module, optimizer=optimizer, fsdp2_config=fsdp2_config)
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
+    prepare_fully_shard(model=model.module, optimizers=optimizer, fsdp2_config=fsdp2_config)
 
     # Initialization checks
     assert len(model.mlp._forward_pre_hooks) == 1, 'Expected 1 forward pre-hook on the mlp module'
     assert len(model.mlp.fc1._forward_pre_hooks) == 0, 'Expected 0 forward pre-hook on the fc1 module'
     assert len(model.mlp.fc2._forward_pre_hooks) == 0, 'Expected 0 forward pre-hook on the fc2 module'
     assert len(model.module._forward_pre_hooks) == 1, 'Expected 1 forward pre-hook on the root module'
-
-    # Check that the weights are DTensor
     assert isinstance(model.mlp.fc1.weight, DTensor), 'mlp.fc1.weight should be a DTensor'
     assert isinstance(model.mlp.fc2.weight, DTensor), 'mlp.fc2.weight should be a DTensor'
-    # Check that all optimizer parameters are DTensor (we only have one param group)
-    assert len(optimizer.param_groups) == 1, 'Expected 1 param group in optimizer'
-    assert len(optimizer.param_groups[0]['params']) >= 1, 'Expected at least 1 parameter in optimizer (depends on the model)'
-    assert all(isinstance(param, DTensor) for param in optimizer.param_groups[0]['params']), 'All parameters in optimizer should be DTensor'
-    # Validate that the ids of the parameters in the optimizer exist in the model
-    model_param_ids = [id(p) for p in model.parameters()]
-    for param in optimizer.param_groups[0]['params']:
-        assert id(param) in model_param_ids, 'Parameter id in optimizer does not match parameter id in model'
 
     if isinstance(model, PartialWeightTiedModel):
         assert len(model.fc3._forward_pre_hooks) == 1, 'Expected 1 forward pre-hook on the fc3 module'
@@ -98,3 +87,82 @@ def test_fsdp2_initialization_with_tied_params(
     weight_2 = model.mlp.fc2.weight.full_tensor()
     assert (model.mlp.fc1.weight is model.mlp.fc2.weight)
     assert (torch.equal(weight_1, weight_2))
+
+
+@pytest.mark.parametrize('case', ['all_params_one_group', 'subset_one_group', 'multiple_groups'])
+@pytest.mark.parametrize('device', _INIT_DEVICES)
+@world_size(2)
+@pytest.mark.gpu
+@pytest.mark.filterwarnings('ignore:FSDP2 Config/APIs are experimental*:UserWarning')
+@pytest.mark.skipif(SKIP_TEST, reason='FSDP2 is not available in torch < 2.6.0')
+def test_fsdp2_optimizer_handling(
+    case: str,
+    world_size: int,
+    device: str,
+):
+    """Test FSDP2 correctly updates optimizer state for various configurations."""
+    del world_size
+
+    num_classes = 10
+    model = PartialWeightTiedModel(num_features=num_classes, device=device)
+    dataset = RandomClassificationDataset(shape=(num_classes,), size=10, num_classes=num_classes)
+    dataloader = DataLoader(dataset, sampler=dist.get_sampler(dataset))
+
+    all_params_list = list(model.parameters())
+    fc1_params_list = list(model.mlp.fc1.parameters())
+    fc3_params_list = list(model.fc3.parameters())
+    remaining_params_list = [
+        p for p in all_params_list
+        if not any(p is param for param in fc1_params_list)
+        and not any(p is param for param in fc3_params_list)
+        and not any(p is param for param in model.mlp.fc2.parameters())  # To avoid double counting the tied parameter
+    ]
+
+    if case == 'all_params_one_group':
+        optimizer_input = [{'params': all_params_list, 'lr': 0.01}]
+    elif case == 'subset_one_group':
+        optimizer_input = [{'params': fc1_params_list, 'lr': 0.02}]
+    elif case == 'multiple_groups':
+        optimizer_input = [
+            {'params': fc1_params_list, 'lr': 0.01},
+            {'params': fc3_params_list, 'lr': 0.02},
+            {'params': remaining_params_list, 'lr': 0.03},
+        ]
+
+    optimizer = torch.optim.Adam(optimizer_input)
+
+    fsdp2_config = FSDP2Config(
+        device_mesh=None,
+        reshard_after_forward=True,
+        mp_policy=None,
+        offload_policy=None,
+    )
+    prepare_fully_shard(model=model.module, optimizers=optimizer, fsdp2_config=fsdp2_config)
+
+    def validate_optimizer_state(current_optimizer: torch.optim.Optimizer, stage: str):
+        assert len(current_optimizer.param_groups) == len(optimizer_input), \
+            f"[{case}/{stage}] Group count mismatch. Expected {len(optimizer_input)}, Got {len(current_optimizer.param_groups)}"
+        for i, group in enumerate(current_optimizer.param_groups):
+            opt_params = group['params']
+            assert len(opt_params) == len(optimizer_input[i]['params']), \
+                f"[{case}/{stage}] Group {i}: Param count mismatch. Expected {len(optimizer_input[i]['params'])}, Got {len(opt_params)}"
+            assert all(isinstance(p, DTensor) for p in opt_params), \
+                f"[{case}/{stage}] Group {i}: Not all parameters are DTensors"
+            assert group['lr'] == optimizer_input[i]['lr'], \
+                f"[{case}/{stage}] Group {i}: LR mismatch. Expected {optimizer_input[i]['lr']}, Got {group['lr']}"
+
+    validate_optimizer_state(optimizer, stage="after_fully_shard")
+
+    model.to_empty(device='cuda')
+    for module in model.modules():
+        model.param_init_fn(module)
+
+    trainer = Trainer(
+        model=model,
+        optimizers=optimizer,
+        train_dataloader=dataloader,
+        max_duration='1ba',
+    )
+    trainer.fit()
+
+    validate_optimizer_state(optimizer, stage="after_fit")
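One behavior the added test does not exercise is the single-optimizer guard introduced in update_optimizer_modules. A hypothetical follow-up test, reusing the fixtures above (illustrative only, not part of this commit), might look like:

@pytest.mark.parametrize('device', _INIT_DEVICES)
@world_size(2)
@pytest.mark.gpu
@pytest.mark.filterwarnings('ignore:FSDP2 Config/APIs are experimental*:UserWarning')
@pytest.mark.skipif(SKIP_TEST, reason='FSDP2 is not available in torch < 2.6.0')
def test_fsdp2_rejects_multiple_optimizers(world_size: int, device: str):
    """Passing more than one optimizer should raise NotImplementedError."""
    del world_size

    model = PartialWeightTiedModel(num_features=10, device=device)
    optimizers = [
        torch.optim.Adam(model.mlp.fc1.parameters(), lr=0.01),
        torch.optim.SGD(model.fc3.parameters(), lr=0.01),
    ]
    fsdp2_config = FSDP2Config(
        device_mesh=None,
        reshard_after_forward=True,
        mp_policy=None,
        offload_policy=None,
    )
    # update_optimizer_modules normalizes the input with ensure_tuple and only
    # supports exactly one optimizer, so a list of two should be rejected.
    with pytest.raises(NotImplementedError):
        prepare_fully_shard(model=model.module, optimizers=optimizers, fsdp2_config=fsdp2_config)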
