 from typing import Optional, Union

 from torch import nn
+from torch.optim import Optimizer
 from torch.distributed._tensor.device_mesh import DeviceMesh
 from torch.distributed.fsdp._fully_shard import fully_shard
 from torch.distributed.fsdp._fully_shard._fsdp_api import MixedPrecisionPolicy, OffloadPolicy
@@ -119,6 +120,50 @@ def _check_param_sharing(module: nn.Module):
     # Start the check from the root model
     _check_param_sharing(model)

+def update_optimizer_modules(
+    optimizer: Optimizer,
+    modules_to_shard: list[nn.Module],
+    model: nn.Module,
+    orig_param_id_to_name: dict[int, str],
+) -> None:
+    """Updates the optimizer's parameter groups to use the sharded model parameters.
+    Assumes no training has occurred yet and the optimizer state is empty.
+
+    Args:
+        optimizer (Optimizer): The optimizer to update.
+        modules_to_shard (list[nn.Module]): The modules that have been sharded.
+        model (nn.Module): The parent model that is also sharded.
+        orig_param_id_to_name (dict[int, str]): Mapping from original parameter IDs to their names.
+    """
+    # Build a mapping from parameter name to sharded parameter (after sharding)
+    name_to_sharded_param = dict(model.named_parameters())
+    for module in modules_to_shard:
+        name_to_sharded_param.update(dict(module.named_parameters()))
+
+    # Create a mapping from old parameters to new DTensor parameters
+    old_to_new_param = {}
+    for group in optimizer.param_groups:
+        for param in group['params']:
+            param_name = orig_param_id_to_name.get(id(param))
+            if param_name is not None and param_name in name_to_sharded_param:
+                old_to_new_param[param] = name_to_sharded_param[param_name]
+            else:
+                # TODO: Look into whether we will ever hit this case...
+                raise ValueError(f"Parameter {param} not found in model")
+
+    # Rebuild each param group with the new parameters, keeping all other group options
+    new_param_groups = []
+    for group in optimizer.param_groups:
+        new_group = {k: v for k, v in group.items() if k != 'params'}
+        new_params = [old_to_new_param[param] for param in group['params']]
+        new_group['params'] = new_params
+        new_param_groups.append(new_group)
+
+    # Replace the optimizer's param groups with the rebuilt ones
+    optimizer.param_groups.clear()
+    for group in new_param_groups:
+        optimizer.add_param_group(group)
+

 def apply_fully_shard(
     model: nn.Module,
@@ -178,6 +223,7 @@ def apply_fully_shard(

 def prepare_fully_shard(
     model: nn.Module,
+    optimizer: Optional[Optimizer],
     fsdp2_config: FSDP2Config,
 ) -> None:
     """Applies FSDP2's `fully_shard` to the model according to given fsdp2_config.
@@ -190,4 +236,14 @@ def prepare_fully_shard(
         None
     """
     modules_to_shard, _ = get_standalone_and_tied_modules(list(model.children()))
+
+    # Build the parameter ID to name mapping (with no duplicates)
+    orig_param_id_to_name = {id(param): name for name, param in model.named_parameters()}
+    for module in modules_to_shard:
+        orig_param_id_to_name.update({id(param): name for name, param in module.named_parameters()})
+
     apply_fully_shard(model, modules_to_shard, fsdp2_config)
+
+    # After the model is sharded in place, update the optimizer's param groups to use the DTensor parameters
+    if optimizer is not None:
+        update_optimizer_modules(optimizer, modules_to_shard, model, orig_param_id_to_name)
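A minimal, self-contained sketch of the param-group remapping that `update_optimizer_modules` performs: the toy `nn.Linear` models, the SGD optimizer, and the swap from `old_model` to `new_model` below are illustrative stand-ins (in the real flow the replacement parameters are the DTensors produced by `fully_shard`), but the rebuild of `optimizer.param_groups` follows the same steps.

```python
import torch
from torch import nn

# Toy stand-ins: old_model plays the unsharded model, new_model the "sharded" one
# (in the real flow the replacement parameters are DTensors; here they are plain tensors).
old_model = nn.Linear(4, 4)
new_model = nn.Linear(4, 4)

optimizer = torch.optim.SGD(old_model.parameters(), lr=0.1, weight_decay=0.01)

# Map original parameter ids to names, then names to the replacement parameters.
orig_param_id_to_name = {id(p): name for name, p in old_model.named_parameters()}
name_to_new_param = dict(new_model.named_parameters())

# Rebuild each param group: keep every option (lr, weight_decay, ...) and swap in
# the replacement parameters that correspond to the same names.
new_param_groups = []
for group in optimizer.param_groups:
    new_group = {k: v for k, v in group.items() if k != 'params'}
    new_group['params'] = [
        name_to_new_param[orig_param_id_to_name[id(p)]] for p in group['params']
    ]
    new_param_groups.append(new_group)

optimizer.param_groups.clear()
for group in new_param_groups:
    optimizer.add_param_group(group)

# The optimizer now steps on new_model's parameters with the original settings.
assert optimizer.param_groups[0]['lr'] == 0.1
assert optimizer.param_groups[0]['params'][0] is new_model.weight
```

Clearing `param_groups` and re-adding the rebuilt groups via `add_param_group` keeps per-group hyperparameters intact while re-pointing `params`, which is only safe because no optimizer state has been created yet.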