1 file changed, +2 −2 lines changed

@@ -1672,7 +1672,6 @@ def _train_batch(self, use_grad_scaling: bool):
             else:
                 original_grad_accum = self.state.grad_accum
                 self.state.grad_accum = min(2 * self.state.grad_accum, device_batch_size)
-                self.logger.data_batch({'trainer/grad_accum': self.state.grad_accum})
                 log.info(('CUDA out of memory detected. Gradient Accumulation '
                           f'increased from {original_grad_accum} -> {self.state.grad_accum}, '
                           'and the batch will be retrained.'))
@@ -1681,7 +1680,8 @@ def _train_batch(self, use_grad_scaling: bool):
                 # back only to this newly raised error.
                 raise caught_timeout_error
             else:
-                # Otherwise, return calculated loss
+                # Otherwise, log grad_accum and return calculated loss
+                self.logger.data_batch({'trainer/grad_accum': self.state.grad_accum})
                 return total_loss

     def _train_microbatches(self, microbatches: Sequence[Batch], ddp_sync: bool = True):