Commit 2b25192

Authored by Moin Nadeem (moinnadeem) and ravi-mosaicml
Add the ability to load a checkpoint without restoring state (#169)
* fixing checkpoint bug
* finalizing fine-tuning a checkpointed model
* address PR feedback
* adding save_checkpoint and load_checkpoint hparams interface
* yapf & pyright
* changing interface
* everyone always asks 'what is yapf', but never 'how is yapf'?
* renaming Checkpointer -> CheckpointSaver
* renaming Checkpointer -> CheckpointSaver
* addressing feedback & friendly renaming
* addressing pyright
* yapf
* adding tests
* moving commits to BERT branch
* changing folder to be relative to run dir
* adding tests
* pyright part 1
* pyright on trainer file
* moving restoring RNG & random seed to else clause
* Fix tests
* Addressed comments

Co-authored-by: Moin Nadeem <[email protected]>
Co-authored-by: Ravi Rahman <[email protected]>
1 parent ba56b2c commit 2b25192

File tree

8 files changed: +309 / -76 lines

composer/core/state.py

Lines changed: 19 additions & 4 deletions
@@ -213,7 +213,24 @@ def state_dict(self) -> types.StateDict:
         state_dict["_is_model_ddp_wrapped"] = isinstance(self.model, DistributedDataParallel)
         return state_dict

-    def load_state_dict(self, state: types.StateDict):
+    def load_model_state(self, state_dict: types.StateDict, strict: bool):
+        """
+        Loads the model's state from a state_dict.
+
+        Args:
+            state_dict (types.StateDict): object returned from call to :meth:`state_dict`.
+            strict (bool): whether the keys in the state_dict should perfectly match the keys in the model.
+        """
+        if state_dict["_is_model_ddp_wrapped"] and not isinstance(self.model, DistributedDataParallel):
+            torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(state_dict['model'], "module.")
+
+        missing_keys, unexpected_keys = self.model.load_state_dict(state_dict['model'], strict=strict)
+        if len(missing_keys) > 0:
+            logger.warning(f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}")
+        if len(unexpected_keys) > 0:
+            logger.warning(f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}")
+
+    def load_state_dict(self, state: types.StateDict, strict: bool = False):
         """Loads the state.

         Args:
@@ -229,9 +246,7 @@ def load_state_dict(self, state: types.StateDict):
             serialized_value = state[state_field_name]

             if state_field_name == "model":
-                if state["_is_model_ddp_wrapped"] and not isinstance(self.model, DistributedDataParallel):
-                    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(serialized_value, "module.")
-                state_field_value.load_state_dict(serialized_value)
+                self.load_model_state(state, strict=strict)
             else:
                 for target in ensure_tuple(state_field_value):
                     if target is None:
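
The non-strict path of load_model_state leans on two stock PyTorch behaviors: consume_prefix_in_state_dict_if_present strips the "module." prefix that DistributedDataParallel adds to every key, and load_state_dict(strict=False) reports mismatched keys instead of raising. A minimal standalone sketch of both, using a toy model that is purely illustrative and not part of this commit:

import torch
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present  # available in PyTorch >= 1.9

# Toy two-layer model standing in for state.model; purely illustrative.
model = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2))

# Pretend this checkpoint came from a DDP-wrapped copy of the model:
# every key carries a "module." prefix.
ddp_state = {f"module.{k}": v for k, v in model.state_dict().items()}

# Same call load_model_state makes: strips the prefix in place when present.
consume_prefix_in_state_dict_if_present(ddp_state, "module.")

# strict=False tolerates mismatched keys and reports them instead of raising,
# which load_model_state then surfaces as warnings.
missing, unexpected = model.load_state_dict(ddp_state, strict=False)
print("missing:", missing, "unexpected:", unexpected)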

composer/datasets/hparams.py

Lines changed: 3 additions & 3 deletions (whitespace-only: trailing whitespace stripped from docstring blank lines)

@@ -40,7 +40,7 @@ def _split_fn(batch: Batch, n_microbatches: int) -> List[Batch]:
 class DataloaderSpec(NamedTuple):
     """Specification for initializing a dataloader when a device transformation function or split function
     is required
-
+
     Parameters:
         dataloader (DataLoader): The initialized dataloader.
         device_transform_fn (TDeviceTransformFn, optional):
@@ -104,12 +104,12 @@ class DatasetHparams(hp.Hparams, abc.ABC, metaclass=metaclass):
     def initialize_object(self, batch_size: int,
                           dataloader_hparams: DataloaderHparams) -> Union[DataLoader, DataloaderSpec]:
         """Creates a :class:`DataLoader` or :class:`DataloaderSpec` for this dataset.
-
+
         Parameters:
             batch_size (int): The size of the batch the dataloader should yield. This batch size is
                 device-specific and already incorporates the world size.
            dataloader_hparams (DataloaderHparams): The dataset-independent hparams for the dataloader
-
+
         Returns:
             Dataloader or DataloaderSpec: The dataloader, or if a custom device transformation
                 or split function is required, a :class:`DataloaderSpec` tuple

composer/optim/pytorch_future.py

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@ def __init__(self,
                  interval='step'):
         if warmup_method not in ("constant", "linear"):
             raise ValueError("Only 'constant' or 'linear' warmup_method accepted, but got {}".format(warmup_method))
+
         self.warmup_factor = warmup_factor
         self.warmup_iters = warmup_iters
         self.warmup_method = warmup_method

composer/trainer/checkpoint.py

Lines changed: 23 additions & 16 deletions
@@ -23,10 +23,14 @@ class CheckpointLoader:

     Args:
         checkpoint_filepath (str): The path to an existing checkpoint file.
+        load_weights_only (bool): Whether to only restore the weights from the checkpoint without restoring the associated state.
+        strict_model_weights (bool): Whether to force that the checkpointed weights must exactly match the model weights.
     """

-    def __init__(self, checkpoint_filepath: str):
+    def __init__(self, checkpoint_filepath: str, load_weights_only: bool = False, strict_model_weights: bool = False):
         self.state_dict = torch.load(checkpoint_filepath, map_location='cpu')
+        self.load_weights_only = load_weights_only
+        self.strict_model_weights = strict_model_weights
         self.checkpoint_rng_state = None

     def load_checkpoint(self, state: State):
@@ -39,19 +43,22 @@ def load_checkpoint(self, state: State):
             The seed that was loaded from the checkpoint if it exists otherwise `None`.
         """

-        state.load_state_dict(self.state_dict["state"])
-        self.checkpoint_rng_state = self._get_checkpoint_rng_state(state, self.state_dict["rng"])
-
-        if "seed" in self.state_dict:
-            world_size = ddp.get_world_size()
-            checkpointed_world_size = len(self.state_dict["seed"])
-            if world_size != checkpointed_world_size:
-                warnings.warn(f"Current world size {world_size} does not match the checkpointed world size "
-                              f"{checkpointed_world_size}. The seed will not be restored.")
-                return
-            seed_to_restore = self.state_dict["seed"][ddp.get_global_rank()]
-            seed_all(seed_to_restore)
-            return seed_to_restore
+        if self.load_weights_only:
+            state.load_model_state(self.state_dict['state'], strict=self.strict_model_weights)
+        else:
+            state.load_state_dict(self.state_dict["state"])
+            self.checkpoint_rng_state = self._get_checkpoint_rng_state(state, self.state_dict["rng"])
+
+            if "seed" in self.state_dict:
+                world_size = ddp.get_world_size()
+                checkpointed_world_size = len(self.state_dict["seed"])
+                if world_size != checkpointed_world_size:
+                    warnings.warn(f"Current world size {world_size} does not match the checkpointed world size "
+                                  f"{checkpointed_world_size}. The seed will not be restored.")
+                    return
+                seed_to_restore = self.state_dict["seed"][ddp.get_global_rank()]
+                seed_all(seed_to_restore)
+                return seed_to_restore

     def restore_checkpoint_rng_state(self, state: State, device: Device):
         """Restore the state of all RNG objects in this context from the loaded checkpoint's data.
@@ -82,11 +89,11 @@ def _get_checkpoint_rng_state(self, state: State, checkpoint_rng_state: StateDict
                           f"RNG state will not be restored.")


-class Checkpointer:
+class CheckpointSaver:
     """Manager for saving state to checkpoint files.

     Args:
-        checkpoint_folder (str): The path to the folder to store checkpoints in.
+        checkpoint_folder (str): The path to store checkpoints in.
         checkpoint_interval (int): The amount of time units to wait between checkpoints.
         checkpoint_interval_unit (str): The unit (`"ep"` or `"it"`) that
             `checkpoint_interval` should be measured in.
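
Taken together, the new flags give CheckpointLoader two modes: weights-only loading for fine-tuning, and full-state loading for resuming. A hedged usage sketch; the checkpoint path is a placeholder and `state` is assumed to be an already-constructed composer.core.State, neither of which appears in this diff:

from composer.trainer.checkpoint import CheckpointLoader

# Fine-tuning: restore only the model weights; optimizer, scheduler, RNG, and seed are untouched.
finetune_loader = CheckpointLoader("pretrain/checkpoints/ep10.pt",  # placeholder path
                                   load_weights_only=True,
                                   strict_model_weights=False)
finetune_loader.load_checkpoint(state=state)  # restores weights only; returns no seed

# Resuming: restore the full training state; the seed comes back if the world size matches the checkpoint's.
resume_loader = CheckpointLoader("pretrain/checkpoints/ep10.pt")
restored_seed = resume_loader.load_checkpoint(state=state)
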
Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+# Copyright 2021 MosaicML. All Rights Reserved.
+
+import logging
+from dataclasses import dataclass
+
+import yahp as hp
+
+from composer.trainer.checkpoint import CheckpointLoader, CheckpointSaver
+
+log = logging.getLogger(__name__)
+
+
+@dataclass
+class CheckpointLoaderHparams(hp.Hparams):
+    """Hparams for the :class:`CheckpointLoader`.
+
+    See the documentation for the :class:`CheckpointLoader`.
+    """
+    filepath: str = hp.required(doc="Path to the serialized state_dict to recover state from.")
+    load_weights_only: bool = hp.optional(doc="Whether to only load the weights from the model.", default=False)
+    strict_model_weights: bool = hp.optional(
+        doc="Ensure that the set of weights in the checkpoint and model must exactly match.", default=False)
+
+    def validate(self):
+        if not self.load_weights_only and self.strict_model_weights:
+            raise ValueError(
+                "Strict cannot be used when load_weights_only is true. Restoring a checkpoint from previous state assumes that the checkpoint should perfectly match the model."
+            )
+
+    def initialize_object(self) -> CheckpointLoader:
+        return CheckpointLoader(checkpoint_filepath=self.filepath,
+                                load_weights_only=self.load_weights_only,
+                                strict_model_weights=self.strict_model_weights)
+
+
+@dataclass
+class CheckpointSaverHparams(hp.Hparams):
+    """Hparams for the :class:`CheckpointSaver`.
+
+    See the documentation for the :class:`CheckpointSaver`.
+    """
+    interval_unit: str = hp.required(
+        doc="Unit for the checkpoint save interval -- should be 'ep' for epochs; 'it' for iterations")
+    interval: int = hp.required(doc="Interval for checkpointing.")
+    folder: str = hp.optional(doc="Folder in which to save checkpoint files. Relative to the run directory, if set."
+                              "Defaults to `checkpoints`.",
+                              default="checkpoints")
+
+    def validate(self):
+        if self.interval < 0:
+            raise ValueError("Checkpointing interval must be greater than zero.")
+        if self.interval_unit not in ['ep', 'it']:
+            raise ValueError("Checkpointing interval unit must be one of 'ep' for epochs, or 'it' for iterations.")
+
+    def initialize_object(self) -> CheckpointSaver:
+        return CheckpointSaver(checkpoint_interval_unit=self.interval_unit,
+                               checkpoint_interval=self.interval,
+                               checkpoint_folder=self.folder)
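
A short sketch of exercising these hparams classes directly in Python rather than from a YAML config. The filepath is a placeholder, and the classes are assumed to be imported from the new hparams module above (its path is not shown on this page):

# Loading side: strict weight matching is only meaningful together with load_weights_only=True.
load_hp = CheckpointLoaderHparams(filepath="pretrain/checkpoints/ep10.pt",  # placeholder path
                                  load_weights_only=True,
                                  strict_model_weights=True)
load_hp.validate()                    # passes; strict without load_weights_only would raise ValueError
loader = load_hp.initialize_object()  # -> CheckpointLoader with the same settings

# Saving side: checkpoint once per epoch into the default 'checkpoints' folder.
save_hp = CheckpointSaverHparams(interval_unit="ep", interval=1)
save_hp.validate()                    # 'ep'/'it' and a non-negative interval are accepted
saver = save_hp.initialize_object()   # -> CheckpointSaver(checkpoint_interval_unit="ep", ...)
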

composer/trainer/trainer.py

Lines changed: 63 additions & 32 deletions
@@ -30,7 +30,7 @@
 from composer.optim import (ComposedScheduler, CosineAnnealingLRHparams, DecoupledSGDWHparams, OptimizerHparams,
                             SchedulerHparams, WarmUpLRHparams)
 from composer.optim.scheduler import ensure_warmup_last
-from composer.trainer.checkpoint import Checkpointer, CheckpointLoader
+from composer.trainer.checkpoint import CheckpointLoader, CheckpointSaver
 from composer.trainer.deepspeed import DeepSpeedHparams
 from composer.trainer.devices.device import Device
 from composer.trainer.devices.device_cpu import DeviceCPU
@@ -93,14 +93,15 @@ class Trainer:
         log_destinations (List[BaseLoggerBackend], optional): The destinations to log training information to.
             (default ``[TQDMLoggerBackend()]``).
         callbacks (Sequence[Callback], optional): The callbacks to run during training. (default: ``[]``)
-        checkpoint_filepath (str, optional): The path to a trainer checkpoint file. If provided
-            the trainer will load the state (along with it's associated attributes) during initialization.
-            (default: ``None``)
-        checkpoint_interval_unit (int, optional): Unit for the checkpoint save interval -- should be 'ep'
-            for epochs, 'it' for iterations, or None to disable checkpointing. (default: ``None``).
-        checkpoint_folder (str, optional): The folder to save checkpoints to. Relative to `os.environ.get('RUN_DIRECTORY', '.')`,
-            (default: ``checkpoints``)
-        checkpoint_interval (int, optional): The frequency with which to checkpoint. (default: ``1``)
+        checkpoint_filepath (str): For loading checkpoints, the path to an existing checkpoint file.
+        load_weights_only (bool): Whether to only restore the weights from the checkpoint without
+            restoring the associated state.
+        strict_model_weights (bool, optional): Whether to force that the checkpointed weights must exactly
+            match the model weights.
+        checkpoint_folder (str): The path to store checkpoints in.
+        checkpoint_interval (int): The amount of time units to wait between creating checkpoints.
+        checkpoint_interval_unit (str, optional): The unit (`"ep"` or `"it"`) that
+            `checkpoint_interval` should be measured in. Set to ``None`` disables checkpointing. (default: ``None``)
         train_subset_num_batches (int, optional): If specified, finish every epoch early after training
             on this many batches. This parameter has no effect if it is greater than ``len(train_dataloader)``.
             If None (the default), then the entire dataloader will be iterated over.
@@ -150,11 +151,15 @@ def __init__(
         log_destinations: Optional[List[BaseLoggerBackend]] = None,
         callbacks: Sequence[Callback] = tuple(),

-        # Checkpoint hparams
+        # Checkpoint loading hparams
         checkpoint_filepath: Optional[str] = None,
+        checkpoint_load_weights_only: bool = False,
+        checkpoint_strict_model_weights: bool = False,
+
+        # Checkpoint saving hparams
         checkpoint_interval_unit: Optional[str] = None,
-        checkpoint_folder: Optional[str] = "checkpoints",
-        checkpoint_interval: Optional[int] = 1,
+        checkpoint_interval: Optional[int] = None,
+        checkpoint_folder: str = "checkpoints",

         # Subset parameters
         train_subset_num_batches: Optional[int] = None,
@@ -295,21 +300,26 @@ def __init__(
         self.state.optimizers = optimizer
         self.state.schedulers = ComposedScheduler(schedulers=schedulers)

-        self.checkpointer = None
         # TODO(#121): get checkpointing working with DeepSpeed.
-        if checkpoint_folder and checkpoint_interval and checkpoint_interval_unit:
+        self.checkpoint_saver = None
+        if checkpoint_interval is not None and checkpoint_interval_unit is not None:
+            self.checkpoint_saver = CheckpointSaver(checkpoint_interval_unit=checkpoint_interval_unit,
+                                                    checkpoint_interval=checkpoint_interval,
+                                                    checkpoint_folder=get_relative_to_run_directory(checkpoint_folder))
+
             if self.deepspeed_enabled:
                 raise NotImplementedError("Checkpointing is not yet supported with DeepSpeed.")
-            self.checkpointer = Checkpointer(checkpoint_folder=get_relative_to_run_directory(checkpoint_folder),
-                                             checkpoint_interval=checkpoint_interval,
-                                             checkpoint_interval_unit=checkpoint_interval_unit)

-        self.checkpoint_loader = None
         # TODO(#121): get checkpointing working with DeepSpeed.
-        if checkpoint_filepath:
+        self.checkpoint_loader = None
+        if checkpoint_filepath is not None:
             if self.deepspeed_enabled:
                 raise NotImplementedError("Checkpointing is not yet supported with DeepSpeed.")
-            self.checkpoint_loader = CheckpointLoader(checkpoint_filepath=checkpoint_filepath)
+
+            self.checkpoint_loader = CheckpointLoader(checkpoint_filepath=checkpoint_filepath,
+                                                      load_weights_only=checkpoint_load_weights_only,
+                                                      strict_model_weights=checkpoint_strict_model_weights)
+
             restored_seed = self.checkpoint_loader.load_checkpoint(state=self.state)
             # Set the restored seed so that the correct seed will be saved in future checkpoints
             # Used to handle the case where another checkpoint is saved after resuming from checkpoint.
@@ -368,6 +378,19 @@ def create_from_hparams(cls, hparams: TrainerHparams) -> Trainer:
                     each evaluation epoch may load a different subset of samples."""))
         eval_dataloader = hparams.val_dataset.initialize_object(eval_device_batch_size, hparams.dataloader)

+        # Checkpoint loading hparams
+        checkpoint_filepath = hparams.load_checkpoint.filepath if hparams.load_checkpoint is not None else None
+        checkpoint_load_weights_only = hparams.load_checkpoint.load_weights_only \
+            if hparams.load_checkpoint is not None else False
+        checkpoint_strict_model_weights = hparams.load_checkpoint.strict_model_weights \
+            if hparams.load_checkpoint is not None else False
+
+        # Checkpoint saving hparams
+        checkpoint_interval_unit = hparams.save_checkpoint.interval_unit \
+            if hparams.save_checkpoint is not None else None
+        checkpoint_interval = hparams.save_checkpoint.interval if hparams.save_checkpoint is not None else None
+        checkpoint_folder = hparams.save_checkpoint.folder if hparams.save_checkpoint is not None else "checkpoints"
+
         trainer = cls(
             model=model,
             train_dataloader=train_dataloader,
@@ -400,11 +423,15 @@ def create_from_hparams(cls, hparams: TrainerHparams) -> Trainer:
             log_destinations=log_destinations,
             callbacks=tuple(callbacks),

-            # Checkpointing hparams
-            checkpoint_filepath=hparams.checkpoint_filepath,
-            checkpoint_interval_unit=hparams.checkpoint_interval_unit,
-            checkpoint_folder=hparams.checkpoint_folder,
-            checkpoint_interval=hparams.checkpoint_interval,
+            # Checkpoint loading hparams
+            checkpoint_filepath=checkpoint_filepath,
+            checkpoint_load_weights_only=checkpoint_load_weights_only,
+            checkpoint_strict_model_weights=checkpoint_strict_model_weights,
+
+            # Checkpoint saving hparams
+            checkpoint_interval_unit=checkpoint_interval_unit,
+            checkpoint_interval=checkpoint_interval,
+            checkpoint_folder=checkpoint_folder,

             # Subset parameters
             train_subset_num_batches=hparams.train_subset_num_batches,
@@ -674,11 +701,12 @@ def _ddp_reduce_tensor_sum(tensor: Tensor) -> Tensor:
                         self.eval(is_batch=True)

                     state.step += 1
-                    if self.checkpointer and self.checkpointer.should_checkpoint(state=state, event=Event.BATCH_END):
-                        self.checkpointer.save_checkpoint(state=state,
-                                                          seed=self.seed,
-                                                          device=self.device,
-                                                          config=self.config)
+                    if self.checkpoint_saver and self.checkpoint_saver.should_checkpoint(state=state,
+                                                                                         event=Event.BATCH_END):
+                        self.checkpoint_saver.save_checkpoint(state=state,
+                                                              seed=self.seed,
+                                                              device=self.device,
+                                                              config=self.config)
         except BreakEpochException:
             log.info(f'Skipping the rest of Epoch {state.epoch}')

@@ -692,8 +720,11 @@ def _ddp_reduce_tensor_sum(tensor: Tensor) -> Tensor:

             state.epoch += 1

-            if self.checkpointer and self.checkpointer.should_checkpoint(state=state, event=Event.EPOCH_END):
-                self.checkpointer.save_checkpoint(state=state, seed=self.seed, device=self.device, config=self.config)
+            if self.checkpoint_saver and self.checkpoint_saver.should_checkpoint(state=state, event=Event.EPOCH_END):
+                self.checkpoint_saver.save_checkpoint(state=state,
+                                                      seed=self.seed,
+                                                      device=self.device,
+                                                      config=self.config)

         self.engine.run_event(Event.TRAINING_END)
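
Finally, a hedged sketch of how a fine-tuning run might pass the new keyword arguments into the Trainer constructor. Only the checkpoint_* keywords come from this commit; the model, dataloaders, max_epochs, and checkpoint path are placeholders for whatever the run already defines:

from composer.trainer import Trainer

trainer = Trainer(
    model=model,                          # placeholder ComposerModel
    train_dataloader=train_dataloader,    # placeholder DataLoader
    eval_dataloader=eval_dataloader,      # placeholder DataLoader
    max_epochs=3,                         # placeholder

    # Checkpoint loading: start from pretrained weights only, requiring an exact key match.
    checkpoint_filepath="pretrain/checkpoints/ep10.pt",  # placeholder path
    checkpoint_load_weights_only=True,
    checkpoint_strict_model_weights=True,

    # Checkpoint saving: write a checkpoint every epoch under <run_directory>/checkpoints.
    checkpoint_interval_unit="ep",
    checkpoint_interval=1,
    checkpoint_folder="checkpoints",
)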
