@@ -33,7 +33,7 @@ def create_trainer_with_model(
     activation_checkpointing: bool = False,
     activation_cpu_offload: bool = False,
     auto_microbatching: bool = False,
-    sync_module_states: bool = False,
+    fsdp1_sync_module_states: bool = False,
 ) -> Trainer:
     """Helper function to create a Trainer with a model, dataloader, and FSDP2 configuration."""
     dataset = RandomClassificationDataset(shape=(num_classes,), size=2, num_classes=num_classes)
@@ -44,12 +44,11 @@ def create_trainer_with_model(
         parallelism_config.fsdp2 = FSDP2Config(
             activation_checkpointing=activation_checkpointing,
             activation_cpu_offload=activation_cpu_offload,
-            sync_module_states=sync_module_states,
         )
     else:
         parallelism_config.fsdp = FSDPConfig(
             state_dict_type='sharded',
-            sync_module_states=sync_module_states,
+            sync_module_states=fsdp1_sync_module_states,
         )
     if optimizer is None:
         optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
@@ -346,6 +345,7 @@ def _create_model_with_mixed_init(model_class: type, num_features: int, device:
         resolved_device = device if dist.get_local_rank() == 0 else 'meta'
 
         # set the bias to be True for SimpleComposerMLP
+        # which is used for a later test
         kwargs = {}
         if model_class == SimpleComposerMLP:
             kwargs['add_bias'] = True
@@ -360,14 +360,17 @@ def _create_model_with_mixed_init(model_class: type, num_features: int, device:
 
     @staticmethod
     def _train_model_and_extract_weights(model, use_fsdp2: bool):
-        """Helper function to train a model and extract its weights."""
+        """Helper function to train a mixed init model and extract its weights."""
         optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
+        kwargs = {}
+        if not use_fsdp2:
+            kwargs['fsdp1_sync_module_states'] = True
         trainer = create_trainer_with_model(
             model=model,
             max_duration=f'10ba',
             use_fsdp2=use_fsdp2,
             optimizer=optimizer,
-            sync_module_states=True,
+            **kwargs,
         )
 
         trainer.fit()
@@ -403,31 +406,6 @@ def _compare_weights(fsdp1_weights: dict, fsdp2_weights: dict, tolerance: float
             assert diff < tolerance, \
                 f'Weight difference for {name} exceeds tolerance: {diff} > {tolerance}.'
 
-    @world_size(2)
-    @pytest.mark.gpu
-    # Note that we are testing on a GPU instance just to make sure we can initialize
-    # on CPU and then move to GPU.
-    @pytest.mark.parametrize('device', ['cuda', 'cpu'])
-    def test_fsdp2_sync_module_states_raises_error_when_invalid_mixed_initialization(
-        self,
-        world_size: int,
-        device: str,
-    ):
-        """Test that FSDP2 sync_module_states raises an error when invalid mixed initialization is detected."""
-        del world_size
-        resolved_device = device if dist.get_local_rank() == 0 else 'meta'
-        model = self._create_model_with_mixed_init(SimpleComposerMLP, 10, resolved_device)
-        with pytest.raises(ValueError) as e:
-            create_trainer_with_model(model=model, num_classes=10, use_fsdp2=True, sync_module_states=False)
-        assert 'Detected mixed initialization where some ranks have model on cpu or gpu and some ranks are on meta' in str(
-            e.value,
-        )
-
-        model_2 = self._create_model_with_mixed_init(SimpleComposerMLP, 10, 'meta')
-        with pytest.raises(ValueError) as e:
-            create_trainer_with_model(model=model_2, num_classes=10, use_fsdp2=True, sync_module_states=True)
-        assert 'Detected that all ranks (including rank 0) have model on meta' in str(e.value)
-
     @world_size(2)
     @pytest.mark.gpu
     # Note that we are testing on a GPU instance just to make sure we can initialize
@@ -458,8 +436,8 @@ def param_init_fn(module: torch.nn.Module):
             model=model,
             num_classes=10,
             use_fsdp2=True,
-            sync_module_states=True,
         )
+        assert trainer.state.fsdp_config.sync_module_states, 'sync_module_states should be True'  # type: ignore
 
         module = trainer.state.model.module  # type: ignore
         assert torch.equal(
@@ -500,9 +478,8 @@ def test_fsdp2_mixed_init_does_not_break_weight_tying(
             model=model,
             num_classes=10,
             use_fsdp2=True,
-            sync_module_states=True,
         )
-
+        assert trainer.state.fsdp_config.sync_module_states, 'sync_module_states should be True'  # type: ignore
         # Check that the weights are correctly tied after training
         trainer.fit()
         weight_1 = model.mlp.fc1.weight.full_tensor()  # type: ignore
@@ -532,8 +509,9 @@ def test_fsdp2_sync_module_state_aligns_with_optimizer_state(
             num_classes=10,
             use_fsdp2=True,
             optimizer=optimizer,
-            sync_module_states=True,
         )
+
+        assert trainer.state.fsdp_config.sync_module_states, 'sync_module_states should be True'  # type: ignore
         trainer.fit()
 
         assert torch.equal(
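
Illustrative usage sketch (not part of the diff): after this change, the FSDP2 path of the test helper no longer accepts a sync flag, while the FSDP1 path keeps the old behavior under the renamed fsdp1_sync_module_states keyword. The model below is assumed to be built the same way as in _create_model_with_mixed_init above; the class count is an arbitrary example value.

# `model` is assumed to come from _create_model_with_mixed_init(SimpleComposerMLP, 10, device).
# FSDP2 path: no sync flag is passed; the tests above instead assert that
# trainer.state.fsdp_config.sync_module_states ends up True on its own.
trainer_fsdp2 = create_trainer_with_model(model=model, num_classes=10, use_fsdp2=True)

# FSDP1 path: the flag is still forwarded to FSDPConfig.sync_module_states,
# now under the fsdp1_ prefix to make its scope explicit.
trainer_fsdp1 = create_trainer_with_model(
    model=model,
    num_classes=10,
    use_fsdp2=False,
    fsdp1_sync_module_states=True,
)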