Commit f307e82

Move grad_accum logging to every step (#1187)

coryMosaicML authored and ravi-mosaicml committed

1 parent 91bc51c

1 file changed (+2 −2 lines)

composer/trainer/trainer.py

Lines changed: 2 additions & 2 deletions
@@ -1672,7 +1672,6 @@ def _train_batch(self, use_grad_scaling: bool):
             else:
                 original_grad_accum = self.state.grad_accum
                 self.state.grad_accum = min(2 * self.state.grad_accum, device_batch_size)
-                self.logger.data_batch({'trainer/grad_accum': self.state.grad_accum})
                 log.info(('CUDA out of memory detected. Gradient Accumulation '
                           f'increased from {original_grad_accum} -> {self.state.grad_accum}, '
                           'and the batch will be retrained.'))
@@ -1681,7 +1680,8 @@ def _train_batch(self, use_grad_scaling: bool):
                 # back only to this newly raised error.
                 raise caught_timeout_error
             else:
-                # Otherwise, return calculated loss
+                # Otherwise, log grad_accum and return calculated loss
+                self.logger.data_batch({'trainer/grad_accum': self.state.grad_accum})
                 return total_loss

     def _train_microbatches(self, microbatches: Sequence[Batch], ddp_sync: bool = True):
