
Commit 995fafe

Checkpointing & DeepSpeed (#199)
* preliminary test for deepspeed checkpoints
* deepspeed tests working
* we'll determine later whether that actually does anything
* checkpointer understands deepspeed
* the tests mostly pass
* tests mostly passing
* consolidated test
* cleanup
* formatting
* pyright and formatting
* isort
* change port to be consistent with test script
* address comments
* fix pyright
1 parent 2907bf0 commit 995fafe

File tree: 7 files changed, +245 −124 lines changed


composer/core/state.py

File mode changed: 100644 → 100755
Lines changed: 26 additions & 0 deletions
@@ -42,6 +42,13 @@
     "scaler",
 ]
 
+# These fields will be serialized using .state_dict(), but will be skipped if DeepSpeed is enabled.
+# When DeepSpeed is being used, model and optimizer states are serialized directly by the DeepSpeed engine.
+STATE_DICT_SERIALIZATION_FIELDS_SKIP_DEEPSPEED = [
+    "model",
+    "_optimizers",
+]
+
 # These fields will not be serialized
 SKIP_SERIALIZATION_FIELDS = [
     "loss", "batch", "outputs", "train_dataloader", "eval_dataloader", "_steps_per_epoch", "_precision_context"
@@ -191,13 +198,22 @@ def state_dict(self) -> types.StateDict:
         """Returns the state as a :class:`dict`."""
         state_dict: types.StateDict = {}
 
+        deepspeed_enabled = False
+        try:
+            import deepspeed
+            deepspeed_enabled = isinstance(self.model, deepspeed.DeepSpeedEngine)
+        except ImportError:
+            pass
+
         for state_field_name, state_field_value in self.__dict__.items():
             if state_field_name in SKIP_SERIALIZATION_FIELDS:
                 continue
             elif state_field_name in DIRECT_SERIALIZATION_FIELDS:
                 state_dict[state_field_name] = state_field_value
                 continue
             elif state_field_name in STATE_DICT_SERIALIZATION_FIELDS:
+                if deepspeed_enabled and state_field_name in STATE_DICT_SERIALIZATION_FIELDS_SKIP_DEEPSPEED:
+                    continue
                 if state_field_name == "model":
                     # Save model directly instead of by class name, since model may be wrapped by DistributedDataParallel
                     serialized_value = state_field_value.state_dict()
@@ -208,9 +224,12 @@ def state_dict(self) -> types.StateDict:
                         if obj is not None
                     }
                 state_dict[state_field_name] = serialized_value
+
             else:
                 raise RuntimeError(f"Unable to serialize field {state_field_name}")
         state_dict["_is_model_ddp_wrapped"] = isinstance(self.model, DistributedDataParallel)
+        if deepspeed_enabled:
+            state_dict["_deepspeed_enabled"] = True
         return state_dict
 
     def load_model_state(self, state_dict: types.StateDict, strict: bool):
@@ -237,12 +256,19 @@ def load_state_dict(self, state: types.StateDict, strict: bool = False):
             state_dict (types.StateDict): object returned from call to :meth:`state_dict`.
 
         """
+
+        deepspeed_enabled = False
+        if "_deepspeed_enabled" in state:
+            deepspeed_enabled = state["_deepspeed_enabled"]
+
         for state_field_name, state_field_value in self.__dict__.items():
             if state_field_name in SKIP_SERIALIZATION_FIELDS:
                 continue
             elif state_field_name in DIRECT_SERIALIZATION_FIELDS:
                 setattr(self, state_field_name, state[state_field_name])
             elif state_field_name in STATE_DICT_SERIALIZATION_FIELDS:
+                if deepspeed_enabled and state_field_name in STATE_DICT_SERIALIZATION_FIELDS_SKIP_DEEPSPEED:
+                    continue
                 serialized_value = state[state_field_name]
 
                 if state_field_name == "model":
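Note (not part of the diff): the pattern behind the state.py changes above is an optional import plus an isinstance check. When the model is a deepspeed.DeepSpeedEngine, Composer's state_dict skips the "model" and "_optimizers" fields and records only a "_deepspeed_enabled" flag, leaving the heavy tensors to the engine's own checkpoint files. A minimal standalone sketch of that detection pattern (the helper name is_deepspeed_model is hypothetical, not from this commit):

    from typing import Any

    def is_deepspeed_model(model: Any) -> bool:
        # Detect a DeepSpeed-wrapped model without requiring deepspeed to be installed.
        try:
            import deepspeed  # optional dependency
        except ImportError:
            return False
        return isinstance(model, deepspeed.DeepSpeedEngine)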

composer/trainer/checkpoint.py

File mode changed: 100644 → 100755
Lines changed: 96 additions & 48 deletions
@@ -3,6 +3,9 @@
 import logging
 import os
 import random
+import shutil
+import tarfile
+import tempfile
 import warnings
 from typing import Any, Dict, Optional
 
@@ -18,6 +21,10 @@
 log = logging.getLogger(__name__)
 
 
+def get_mosaic_checkpoint_filepath(checkpoint_folder: str, checkpoint_tag: str):
+    return os.path.join(checkpoint_folder, checkpoint_tag, "mosaic_states.pt")
+
+
 class CheckpointLoader:
     """Manager for initializing state and restoring RNG state from existing checkpoints.
 
2835
"""
2936

3037
def __init__(self, checkpoint_filepath: str, load_weights_only: bool = False, strict_model_weights: bool = False):
31-
self.state_dict = torch.load(checkpoint_filepath, map_location='cpu')
38+
self.checkpoint_filepath = checkpoint_filepath
3239
self.load_weights_only = load_weights_only
3340
self.strict_model_weights = strict_model_weights
3441
self.checkpoint_rng_state = None
@@ -42,25 +49,45 @@ def load_checkpoint(self, state: State):
         Returns:
             The seed that was loaded from the checkpoint if it exists otherwise `None`.
         """
+        seed_to_restore = None
 
-        if self.load_weights_only:
-            state.load_model_state(self.state_dict['state'], strict=self.strict_model_weights)
-        else:
-            state.load_state_dict(self.state_dict["state"])
-            self.checkpoint_rng_state = self._get_checkpoint_rng_state(state, self.state_dict["rng"])
-
-            if "seed" in self.state_dict:
-                world_size = ddp.get_world_size()
-                checkpointed_world_size = len(self.state_dict["seed"])
-                if world_size != checkpointed_world_size:
-                    warnings.warn(f"Current world size {world_size} does not match the checkpointed world size "
-                                  f"{checkpointed_world_size}. The seed will not be restored.")
-                    return
-                seed_to_restore = self.state_dict["seed"][ddp.get_global_rank()]
-                seed_all(seed_to_restore)
-                return seed_to_restore
-
-    def restore_checkpoint_rng_state(self, state: State, device: Device):
+        with tempfile.TemporaryDirectory() as checkpoint_folder:
+            with tarfile.open(self.checkpoint_filepath) as tarball:
+                tarball.extractall(checkpoint_folder)
+
+            checkpoint_tag = os.listdir(checkpoint_folder)[0]
+            mosaic_checkpoint_filepath = get_mosaic_checkpoint_filepath(checkpoint_folder, checkpoint_tag)
+
+            state_dict = torch.load(mosaic_checkpoint_filepath, map_location='cpu')
+
+            if self.load_weights_only:
+                state.load_model_state(state_dict['state'], strict=self.strict_model_weights)
+            else:
+                state.load_state_dict(state_dict["state"])
+                self.checkpoint_rng_state = self._get_checkpoint_rng_state(state_dict["rng"])
+
+                if "seed" in state_dict:
+                    world_size = ddp.get_world_size()
+                    checkpointed_world_size = len(state_dict["seed"])
+                    if world_size != checkpointed_world_size:
+                        warnings.warn(f"Current world size {world_size} does not match the checkpointed world size "
+                                      f"{checkpointed_world_size}. The seed will not be restored.")
+                    else:
+                        seed_to_restore = state_dict["seed"][ddp.get_global_rank()]
+                        seed_all(seed_to_restore)
+
+            try:
+                import deepspeed
+                if isinstance(state.model, deepspeed.DeepSpeedEngine):
+                    load_path, _ = state.model.load_checkpoint(checkpoint_folder, checkpoint_tag)  # type: ignore
+                    if load_path is None:
+                        raise RuntimeError(f"Failed to load DeepSpeed checkpoint from {self.checkpoint_filepath}")
+            except ImportError:
+                pass
+
+        return seed_to_restore
+
+    def restore_checkpoint_rng_state(self, device: Device):
         """Restore the state of all RNG objects in this context from the loaded checkpoint's data.
         """
 
@@ -79,7 +106,7 @@ def restore_checkpoint_rng_state(self, state: State, device: Device):
 
         self.checkpoint_rng_state = None
 
-    def _get_checkpoint_rng_state(self, state: State, checkpoint_rng_state: StateDict) -> Optional[StateDict]:
+    def _get_checkpoint_rng_state(self, checkpoint_rng_state: StateDict) -> Optional[StateDict]:
         original_world_size = len(checkpoint_rng_state["torch"])
         if original_world_size == ddp.get_world_size():
             return checkpoint_rng_state
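Note (not part of the diff): since a checkpoint is now a .tgz archive containing a single tag directory rather than a bare .pt file, it can be inspected outside the trainer by reversing the steps load_checkpoint performs above. A rough sketch, assuming a checkpoint was saved to checkpoints/ep1.tgz (the path is illustrative):

    import os
    import tarfile
    import tempfile

    import torch

    archive = "checkpoints/ep1.tgz"  # illustrative path, not from the commit

    with tempfile.TemporaryDirectory() as folder:
        with tarfile.open(archive) as tarball:
            tarball.extractall(folder)  # yields a single top-level tag directory, e.g. "ep1/"
        tag = os.listdir(folder)[0]

        # Composer's own state; DeepSpeed's engine files (when enabled) sit in the same tag directory.
        mosaic_states = torch.load(os.path.join(folder, tag, "mosaic_states.pt"), map_location="cpu")
        print(sorted(mosaic_states.keys()))  # expected keys include 'rng', 'seed', and 'state'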
@@ -139,39 +166,60 @@ def save_checkpoint(self, state: State, seed: int, device: Device, config: Optio
             'rng': self._get_rng_state(device=device),  # stored across all ranks
             'seed': ddp.all_gather_object(seed),
         }
-        if ddp.get_global_rank() != 0:
-            # only rank 0 saves checkpoints
-            # Need the check down here so all the DDP syncs will work for generating the checkpoint
-            return
 
-        # we add the state only on rank 0 since other processes don't have loggers to serialize
-        state_dict['state'] = state.state_dict()  # should be the same across all ranks. per-rank state not stored
-
-        if config:
-            hparams_path = os.path.join(self.checkpoint_folder, "hparams.yaml")
-            os.makedirs(self.checkpoint_folder, mode=0o775, exist_ok=True)
-            config_yaml_str = yaml.dump(config)
-            try:
-                with open(hparams_path, "x") as f:
-                    # Storing the config (ex. hparams) in a separate file so they can be modified before resuming
-                    f.write(config_yaml_str)
-            except FileExistsError as e:
-                with open(hparams_path, "r") as f:
-                    # comparing the parsed hparams to ignore whitespace and formatting differences
-                    if yaml.safe_load(config_yaml_str) != yaml.safe_load(f):
-                        raise RuntimeError(f"The hparams in the existing checkpoint folder {self.checkpoint_folder} "
-                                           "differ from those being used in the current training run. "
-                                           "Please specify a new checkpoint folder.") from e
         if self.save_event == Event.EPOCH_END:
-            filename = f"ep{state.epoch}.pt"
+            tag = f"ep{state.epoch}"
         elif self.save_event == Event.BATCH_END:
-            filename = f"it{state.step}.pt"
+            tag = f"it{state.step}"
         else:
             raise ValueError(f"Invalid checkpoint event: {self.save_event}")
-        save_file = os.path.join(self.checkpoint_folder, filename)
-        with open(save_file, 'xb') as f:
-            torch.save(state_dict, f)
-        log.info(f'Trainer checkpoint saved to {save_file}')
+
+        try:
+            import deepspeed
+            if isinstance(state.model, deepspeed.DeepSpeedEngine):
+                state.model.save_checkpoint(self.checkpoint_folder, tag)  # type: ignore
+        except ImportError:
+            pass
+
+        if ddp.get_global_rank() == 0:
+            # only rank 0 saves checkpoints
+
+            # we add the state only on rank 0 since other processes don't have loggers to serialize
+            state_dict['state'] = state.state_dict()  # should be the same across all ranks. per-rank state not stored
+
+            if config:
+                hparams_path = os.path.join(self.checkpoint_folder, "hparams.yaml")
+                os.makedirs(self.checkpoint_folder, mode=0o775, exist_ok=True)
+                config_yaml_str = yaml.dump(config)
+                try:
+                    with open(hparams_path, "x") as f:
+                        # Storing the config (ex. hparams) in a separate file so they can be modified before resuming
+                        f.write(config_yaml_str)
+                except FileExistsError as e:
+                    with open(hparams_path, "r") as f:
+                        # comparing the parsed hparams to ignore whitespace and formatting differences
+                        if yaml.safe_load(config_yaml_str) != yaml.safe_load(f):
+                            raise RuntimeError(
+                                f"The hparams in the existing checkpoint folder {self.checkpoint_folder} "
+                                "differ from those being used in the current training run. "
+                                "Please specify a new checkpoint folder.") from e
+            checkpoint_filepath = os.path.join(self.checkpoint_folder, tag)
+            mosaic_states_filepath = get_mosaic_checkpoint_filepath(self.checkpoint_folder, tag)
+            if not os.path.exists(checkpoint_filepath):
+                os.makedirs(checkpoint_filepath)
+            with open(mosaic_states_filepath, 'xb') as f:
+                torch.save(state_dict, f)
+
+            checkpoint_archive_filepath = os.path.join(self.checkpoint_folder, f'{tag}.tgz')
+            with tarfile.open(checkpoint_archive_filepath, "w:gz") as tarball:
+                tarball.add(checkpoint_filepath, arcname=tag)
+
+            shutil.rmtree(checkpoint_filepath)
+
+            log.info(f'Trainer checkpoint saved to {checkpoint_archive_filepath}')
+
+        # Ensure that the non-rank 0 processes don't exit before the checkpoint is saved.
+        ddp.barrier()
 
     def _get_rng_state(self, device: Device) -> StateDict:
         rng_state = {
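Note (not part of the diff): the save path mirrors the loader. Every rank lets the DeepSpeed engine write its shards into <checkpoint_folder>/<tag>/, rank 0 adds mosaic_states.pt, then rank 0 rolls the tag directory into <tag>.tgz and deletes the directory, and ddp.barrier() keeps the other ranks alive until the archive exists. The directory-to-archive step in isolation looks roughly like this (a sketch; the helper name archive_checkpoint_dir is invented):

    import os
    import shutil
    import tarfile

    def archive_checkpoint_dir(checkpoint_folder: str, tag: str) -> str:
        # Pack <checkpoint_folder>/<tag>/ into <checkpoint_folder>/<tag>.tgz and remove the directory.
        checkpoint_dirpath = os.path.join(checkpoint_folder, tag)
        archive_filepath = os.path.join(checkpoint_folder, f"{tag}.tgz")
        with tarfile.open(archive_filepath, "w:gz") as tarball:
            # arcname keeps the tag as the single top-level entry inside the archive
            tarball.add(checkpoint_dirpath, arcname=tag)
        shutil.rmtree(checkpoint_dirpath)
        return archive_filepath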

composer/trainer/trainer.py

Lines changed: 34 additions & 40 deletions
@@ -93,7 +93,7 @@ class Trainer:
         log_destinations (List[BaseLoggerBackend], optional): The destinations to log training information to.
             (default ``[TQDMLoggerBackend()]``).
         callbacks (Sequence[Callback], optional): The callbacks to run during training. (default: ``[]``)
-        checkpoint_filepath (str): For loading checkpoints, the path to an existing checkpoint file.
+        checkpoint_filepath (str): For loading checkpoints, the path to an existing checkpoint.
         load_weights_only (bool): Whether to only restore the weights from the checkpoint without
             restoring the associated state.
         strict_model_weights (bool, optional): Whether to force that the checkpointed weights must exactly
@@ -300,33 +300,49 @@ def __init__(
         self.state.optimizers = optimizer
         self.state.schedulers = ComposedScheduler(schedulers=schedulers)
 
-        # TODO(#121): get checkpointing working with DeepSpeed.
+        assert isinstance(self.state.model, BaseMosaicModel)
+        self.original_model = self.state.model  # type: ignore # TODO(ravi) -- update the state to add an original model helper
+
         self.checkpoint_saver = None
-        if checkpoint_interval is not None and checkpoint_interval_unit is not None:
-            self.checkpoint_saver = CheckpointSaver(checkpoint_interval_unit=checkpoint_interval_unit,
+        if checkpoint_folder and checkpoint_interval and checkpoint_interval_unit:
+            self.checkpoint_saver = CheckpointSaver(checkpoint_folder=get_relative_to_run_directory(checkpoint_folder),
                                                     checkpoint_interval=checkpoint_interval,
-                                                    checkpoint_folder=get_relative_to_run_directory(checkpoint_folder))
-
-            if self.deepspeed_enabled:
-                raise NotImplementedError("Checkpointing is not yet supported with DeepSpeed.")
+                                                    checkpoint_interval_unit=checkpoint_interval_unit)
 
-        # TODO(#121): get checkpointing working with DeepSpeed.
         self.checkpoint_loader = None
-        if checkpoint_filepath is not None:
-            if self.deepspeed_enabled:
-                raise NotImplementedError("Checkpointing is not yet supported with DeepSpeed.")
-
+        if checkpoint_filepath:
             self.checkpoint_loader = CheckpointLoader(checkpoint_filepath=checkpoint_filepath,
                                                       load_weights_only=checkpoint_load_weights_only,
                                                       strict_model_weights=checkpoint_strict_model_weights)
 
+        # place the state, model in the proper devices, and initialize from a checkpoint if provided
+        if self.deepspeed_enabled:
+            import deepspeed
+
+            assert self.deepspeed_hparams is not None
+            deepspeed_config = self.deepspeed_hparams.initialize_object(self.state, self.grad_clip_norm)
+            optimizer = ensure_tuple(self.state.optimizers)[0]
+            (self.state.model, self.state.optimizers, _, _) = deepspeed.initialize(
+                config=deepspeed_config,
+                model=self.state.model,
+                optimizer=optimizer,
+            )
+
+        # If using DeepSpeed, the model must be loaded from checkpoint after the engine has been
+        # initialized, but if using PyTorch DDP, the model must be loaded before it is wrapped with
+        # DDP.
+        if self.checkpoint_loader:
             restored_seed = self.checkpoint_loader.load_checkpoint(state=self.state)
-            # Set the restored seed so that the correct seed will be saved in future checkpoints
-            # Used to handle the case where another checkpoint is saved after resuming from checkpoint.
-            # In this case, self.seed is stored in the second checkpoint so it must have the correct value.
             if restored_seed is not None:
                 self.seed = restored_seed
 
+        if not self.deepspeed_enabled:
+            self.state.model = self.device.module_to_device(self.state.model)
+            self.state.optimizers = map_collection(self.state.optimizers, self.device.optimizer_to_device)
+
+            # wrap model with DDP
+            self.state.model = ddp.prepare_module(self.state.model, self.find_unused_parameters)
+
     @classmethod
     def create_from_hparams(cls, hparams: TrainerHparams) -> Trainer:
         """Instantiate a Trainer using a `TrainerHparams` object.
@@ -551,28 +567,6 @@ def _train_loop(self) -> None:
             raise NotImplementedError("The Mosaic trainer only supports one optimizer; "
                                       f"found {len(ensure_tuple(state.optimizers))} optimizers")
 
-        assert isinstance(state.model, BaseMosaicModel)
-        self.original_model = state.model  # type: ignore # TODO(ravi) -- update the state to add an original model helper
-
-        # place the state, model in the proper devices
-        if self.deepspeed_enabled:
-            import deepspeed
-
-            assert self.deepspeed_hparams is not None
-            deepspeed_config = self.deepspeed_hparams.initialize_object(state, self.grad_clip_norm)
-            optimizer = ensure_tuple(state.optimizers)[0]
-            (state.model, state.optimizers, _, _) = deepspeed.initialize(
-                config=deepspeed_config,
-                model=state.model,
-                optimizer=optimizer,
-            )
-        else:
-            state.model = self.device.module_to_device(state.model)
-            state.optimizers = map_collection(state.optimizers, self.device.optimizer_to_device)
-
-        # wrap model with DDP
-        state.model = ddp.prepare_module(state.model, self.find_unused_parameters)
-
         # print training start
         self.logger.metric_fit({"trainer/algorithms": [str(algo) for algo in self.engine.algorithms]})
 
@@ -609,7 +603,7 @@ def _ddp_reduce_tensor_sum(tensor: Tensor) -> Tensor:
 
         if self.state.batch_idx == 0 and self.checkpoint_loader:
             # only restore the rng state here if the step in the current epoch is zero.
-            self.checkpoint_loader.restore_checkpoint_rng_state(self.state, self.device)
+            self.checkpoint_loader.restore_checkpoint_rng_state(self.device)
 
         for _ in range(state.epoch, state.max_epochs):
             try:
@@ -625,7 +619,7 @@ def _ddp_reduce_tensor_sum(tensor: Tensor) -> Tensor:
                 # if resuming, skip dataloader forward to the minibatch index
                 if batch_idx < self.state.batch_idx:
                     if self.checkpoint_loader:
-                        self.checkpoint_loader.restore_checkpoint_rng_state(self.state, self.device)
+                        self.checkpoint_loader.restore_checkpoint_rng_state(self.device)
                     continue
 
                 state.last_batch_size = self._get_batch_size(state.batch)
