Commit 0272447

fixed small bugs
1 parent d5beb0d commit 0272447

5 files changed: +23 -21 lines changed


composer/distributed/dist_strategy.py

Lines changed: 2 additions & 14 deletions
@@ -31,7 +31,7 @@
     get_mixed_precision,
     set_custom_fsdp_module_kwargs,
 )
-from composer.distributed.shared_utils import add_fsdp_oom_hooks
+from composer.distributed.shared_utils import add_fsdp_oom_hooks, validate_model_requires_state_sync
 from composer.utils import FSDPConfig, StringEnum, TPConfig, dist, ensure_tuple, get_device

 __all__ = ['DDPSyncStrategy', 'ddp_sync_context', 'prepare_ddp_module', 'prepare_fsdp_module', 'prepare_tp_module']
@@ -246,19 +246,7 @@ def prepare_fsdp_module(
     _validate_precision(precision, device)

     # Check sync_module_states is True for mixed initialization or HSDP
-    if fsdp_config.sync_module_states == False:
-        rank_on_meta = 1 if next(model.parameters()).device.type == 'meta' else 0
-        all_ranks_meta = device.tensor_to_device(torch.tensor([rank_on_meta], dtype=torch.uint8))
-        dist.all_reduce(all_ranks_meta, reduce_operation='MIN')
-        any_ranks_meta = device.tensor_to_device(torch.tensor([rank_on_meta], dtype=torch.uint8))
-        dist.all_reduce(any_ranks_meta, reduce_operation='MAX')
-        if all_ranks_meta.item() == 0 and any_ranks_meta.item() == 1:
-            raise ValueError(
-                'Detected mixed initialization where some ranks have model on cpu or '
-                'gpu and some ranks are on meta. Either keep all ranks on the same '
-                "device or set parallelism_config['fsdp']['sync_module_states'] = True. Otherwise, "
-                'some weights may be randomly initialized when loading a checkpoint.',
-            )
+    validate_model_requires_state_sync(model, fsdp_config)

     # Handles of FSDP sync hooks if automicrobatching is on
     hook_handles = []

composer/distributed/prepare_distributed.py

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
 from composer.distributed.fsdp2 import prepare_fully_shard, sync_module_states
 from composer.distributed.fsdp2_utils import generate_composer_model_policy, sync_optimizer_and_model_params
 from composer.distributed.param_init import meta_init
-from composer.distributed.shared_utils import update_model_requires_state_sync
+from composer.distributed.shared_utils import validate_model_requires_state_sync
 from composer.models import ComposerModel
 from composer.utils import dist
 from composer.utils.parallelism import FSDP2Config
@@ -69,7 +69,7 @@ def _parallelize_model_helper(
     2. With sync_module_states: param_init on rank 0 first, then fully_shard, then broadcast the
        initialized state to all other ranks. This makes sure that all ranks have rank 0's model state.
     """
-    update_model_requires_state_sync(model, config)
+    validate_model_requires_state_sync(model, config)

     if config.sync_module_states:
         # If we are syncing module states, we assume that rank 0 has the model on CPU/GPU

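For context, the sync_module_states path described in the docstring above expects only rank 0 to hold real weights while every other rank builds the module on the meta device. Below is a minimal sketch of that setup, following the device-selection pattern from the test change further down; the helper name, the generic model class, and the hard-coded 'cuda' device are illustrative and not part of this commit.

import torch
from composer.utils import dist

def build_model_for_state_sync(model_cls: type, **model_kwargs) -> torch.nn.Module:
    # Rank 0 materializes real weights on CPU/GPU ('cuda' stands in for whatever
    # accelerator the trainer targets); all other ranks construct the module on
    # the meta device, so only shapes/dtypes exist there until rank 0's state is
    # broadcast after sharding (sync_module_states=True).
    device = 'cuda' if dist.get_local_rank() == 0 else 'meta'
    return model_cls(device=device, **model_kwargs)

Any other layout, such as a non-zero rank ending up on CPU/GPU or rank 0 ending up on meta, is exactly what validate_model_requires_state_sync is meant to reject.
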
composer/distributed/shared_utils.py

Lines changed: 11 additions & 3 deletions
@@ -159,7 +159,7 @@ def add_fsdp_oom_hooks(model: torch.nn.Module, device: Optional[Device] = None)
     return hook_handles


-def update_model_requires_state_sync(model: nn.Module, fsdp_config: FSDP2Config | FSDPConfig) -> None:
+def validate_model_requires_state_sync(model: nn.Module, fsdp_config: FSDP2Config | FSDPConfig) -> None:
     """Checks if sync_module_states configuration is compatible with model initialization.

     When sync_module_states is False, this function checks that all ranks have their model
@@ -189,12 +189,20 @@ def update_model_requires_state_sync(model: nn.Module, fsdp_config: FSDP2Config
         raise ValueError(
             'Detected mixed initialization where some ranks have model on cpu or '
             'gpu and some ranks are on meta. Please set '
-            'parallelism_config["fsdp"]["sync_module_states"] = True.',
+            'parallelism_config["fsdp"]["sync_module_states"] = True. '
+            'Make sure that rank 0 is the only rank where the model is NOT on meta.',
         )

+    # Note: We do this instead of raising an error if
+    # (rank=0 && device=meta) || (rank!=0 && device!=meta)
+    # because that only raises an error on disparate ranks
+    # and that crashes the program, we instead just assert
+    # the setup later and note in the earlier error if the setup is invalid
     if fsdp_config.sync_module_states and all_ranks_meta.item() == 1:
         raise ValueError(
             'Detected that all ranks (including rank 0) have model on meta. '
             'Will not sync module states. Please set '
-            'parallelism_config["fsdp"]["sync_module_states"] = False.',
+            'parallelism_config["fsdp"]["sync_module_states"] = False. '
+            'If you want to sync module states, make sure that rank 0 '
+            'is the only rank where the model is NOT on meta.',
         )

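The hunk above only shows the renamed signature, the updated error messages, and the new note; the detection logic itself is unchanged context that the diff elides. Stitching it together with the inline check removed from dist_strategy.py gives roughly the sketch below. How the helper obtains the device and exactly which condition guards the first error are assumptions, not shown in this commit.

from __future__ import annotations

import torch
from torch import nn

from composer.utils import FSDPConfig, dist, get_device  # get_device(None) picking the current accelerator is an assumption
from composer.utils.parallelism import FSDP2Config


def validate_model_requires_state_sync(model: nn.Module, fsdp_config: FSDP2Config | FSDPConfig) -> None:
    # Reconstruction for reference only; see the lead-in for what is assumed.
    device = get_device(None)

    # Each rank reports whether its copy of the model lives on the meta device.
    rank_on_meta = 1 if next(model.parameters()).device.type == 'meta' else 0
    all_ranks_meta = device.tensor_to_device(torch.tensor([rank_on_meta], dtype=torch.uint8))
    dist.all_reduce(all_ranks_meta, reduce_operation='MIN')  # 1 only if every rank is on meta
    any_ranks_meta = device.tensor_to_device(torch.tensor([rank_on_meta], dtype=torch.uint8))
    dist.all_reduce(any_ranks_meta, reduce_operation='MAX')  # 1 if any rank is on meta

    # Mixed initialization (some ranks on cpu/gpu, some on meta) without state sync.
    # Guarding this on sync_module_states being False is assumed from the old inline check.
    if not fsdp_config.sync_module_states and all_ranks_meta.item() == 0 and any_ranks_meta.item() == 1:
        raise ValueError(
            'Detected mixed initialization where some ranks have model on cpu or '
            'gpu and some ranks are on meta. Please set '
            'parallelism_config["fsdp"]["sync_module_states"] = True. '
            'Make sure that rank 0 is the only rank where the model is NOT on meta.',
        )

    # State sync requested but there is no source of weights: every rank, including rank 0, is on meta.
    if fsdp_config.sync_module_states and all_ranks_meta.item() == 1:
        raise ValueError(
            'Detected that all ranks (including rank 0) have model on meta. '
            'Will not sync module states. Please set '
            'parallelism_config["fsdp"]["sync_module_states"] = False. '
            'If you want to sync module states, make sure that rank 0 '
            'is the only rank where the model is NOT on meta.',
        )

The MIN/MAX pair of all-reduces lets every rank reach the same verdict collectively, so the helper can raise the same error on all ranks rather than only on the ranks whose device differs, which is the point of the note added in this commit.
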
tests/common/models.py

Lines changed: 1 addition & 1 deletion
@@ -140,7 +140,7 @@ def __init__(
         num_features: int,
         device: Union[str, torch.device],
         num_classes: int = 3,
-        add_bias: bool = True,
+        add_bias: bool = False,
     ):
         fc1 = torch.nn.Linear(num_features, num_features, device=device, bias=add_bias)
         fc2 = torch.nn.Linear(num_features, num_classes, device=device, bias=add_bias)

tests/trainer/test_fsdp2.py

Lines changed: 7 additions & 1 deletion
@@ -344,7 +344,13 @@ def _create_model_with_mixed_init(model_class: type, num_features: int, device:
         TestFSDP2MixedInit._set_deterministic_seed(seed)

         resolved_device = device if dist.get_local_rank() == 0 else 'meta'
-        model = model_class(num_features=num_features, device=resolved_device)
+
+        # set the bias to be True for SimpleComposerMLP
+        kwargs = {}
+        if model_class == SimpleComposerMLP:
+            kwargs['add_bias'] = True
+
+        model = model_class(num_features=num_features, device=resolved_device, **kwargs)
         model.add_fsdp_wrap_attribute_to_children()

         if dist.get_local_rank() == 0:
