@@ -256,26 +256,30 @@ def _subsample(self, device_offset, text_batch):
         return text_batch
 
     def _shard_dataset(self, dataset):
-        # Select a deterministic subset of filepaths for sharded data-parallel training
+        # Verify # of shards
         filepaths = dataset._ex_iterable.kwargs['filepaths']
         if self.num_shards != len(filepaths):
             raise ValueError(f"Found {len(filepaths)} shards, expected {self.num_shards}")
 
+        # Determine how to allocate devices to shards
         devices_per_shard = 1
-        if self.world_size > self.num_shards:
+        if self.num_shards < self.world_size:
             log.warning(
                 f"Not enough unique shards ({self.num_shards}) for world size ({self.world_size}). Splitting shards among devices."
             )
             if self.world_size % self.num_shards != 0:
                 raise ValueError(f"Cannot evenly split {self.num_shards} shards among {self.world_size} devices")
             devices_per_shard = self.world_size // self.num_shards
+        elif self.num_shards % self.world_size != 0:
+            raise ValueError(f"Cannot evenly split {self.num_shards} shards among {self.world_size} devices")
         shard_offset = self.rank // devices_per_shard
         device_offset = self.rank % devices_per_shard
 
+        # Select a deterministic subset of shards
         device_filepaths = filepaths[shard_offset::self.world_size]
         dataset._ex_iterable.kwargs['filepaths'] = device_filepaths
 
-        # Subsample dataset if shard is being shared among devices
+        # Subsample shard if shard is being shared among devices
         # NOTE: Mapping is executed in batched mode for better CPU utilization,
         # but the returned dataset is still an iterable over text samples
         if devices_per_shard > 1:
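For reference, a minimal standalone sketch of the rank-to-shard arithmetic this hunk introduces. `assign_shard` and its bare `rank`/`world_size`/`num_shards` parameters are hypothetical stand-ins for the instance attributes used in `_shard_dataset`; this is not part of the diff.

# Minimal sketch (assumed names, not part of the diff) of the rank-to-shard
# arithmetic above: each device computes which shard it reads (shard_offset)
# and its position within that shard's device group (device_offset).
def assign_shard(rank: int, world_size: int, num_shards: int):
    devices_per_shard = 1
    if num_shards < world_size:
        # Several devices must share one shard; require an even split.
        if world_size % num_shards != 0:
            raise ValueError(f"Cannot evenly split {num_shards} shards among {world_size} devices")
        devices_per_shard = world_size // num_shards
    elif num_shards % world_size != 0:
        raise ValueError(f"Cannot evenly split {num_shards} shards among {world_size} devices")
    shard_offset = rank // devices_per_shard
    device_offset = rank % devices_per_shard
    return shard_offset, device_offset

# 2 shards, 4 devices: ranks 0 and 1 share shard 0, ranks 2 and 3 share shard 1.
for rank in range(4):
    print(rank, assign_shard(rank, world_size=4, num_shards=2))
# 0 (0, 0)
# 1 (0, 1)
# 2 (1, 0)
# 3 (1, 1)

Note that the slice `filepaths[shard_offset::self.world_size]` covers both regimes: when `devices_per_shard > 1`, `shard_offset < num_shards < world_size`, so the stride selects exactly one filepath per shard group; otherwise it deals out `num_shards // world_size` filepaths per rank.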