@@ -537,19 +537,25 @@ class MultiStepWithWarmupScheduler(ComposerScheduler):
    rate multiplier until the warmup has completed.

    .. warning::
-        Initial warmup time is **not** scaled according to any provided scale schedule ratio! However, the milestones
-        will still be scaled accordingly.
+        By default, initial warmup time is **not** scaled according to any provided scale schedule ratio.
+        To change this behavior, set ``scale_warmup=True``.

    Args:
        t_warmup (str | Time): Warmup time.
        milestones (List[str | Time]): Times at which the learning rate should change.
        gamma (float): Multiplicative decay factor. Default = ``0.1``.
+        scale_warmup (bool): SSR also scales the warmup period. Default = ``False``.
    """

-    def __init__(self, t_warmup: Union[str, Time], milestones: List[Union[str, Time]], gamma: float = 0.1):
+    def __init__(self,
+                 t_warmup: Union[str, Time],
+                 milestones: List[Union[str, Time]],
+                 gamma: float = 0.1,
+                 scale_warmup: bool = False):
        self.t_warmup = t_warmup
        self.milestones = milestones
        self.gamma = gamma
+        self.scale_warmup = scale_warmup
        self.warmup_scheduler = LinearScheduler(alpha_i=0.0, alpha_f=1.0, t_max=t_warmup)
        self.step_scheduler = MultiStepScheduler(milestones=milestones, gamma=gamma)

@@ -563,6 +569,8 @@ def __call__(self, state: State, ssr: float = 1.0):
                same unit as the trainer's max_duration parameter."""))

        if state.timestamp < t_warmup:
+            if self.scale_warmup:
+                return self.warmup_scheduler(state, ssr)
            return self.warmup_scheduler(state)

        return self.step_scheduler(state, ssr)
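To illustrate what the new flag changes in practice, here is a minimal usage sketch (assuming ``composer`` is installed and the class is importable from ``composer.optim``; the time strings and values below are illustrative, not taken from this diff):

# Hypothetical usage sketch: the import path and the time strings are
# assumptions about the surrounding library, not part of this change.
from composer.optim import MultiStepWithWarmupScheduler

scheduler = MultiStepWithWarmupScheduler(
    t_warmup='100ba',             # 100 batches of linear warmup
    milestones=['30ep', '60ep'],  # decay the multiplier at 30 and 60 epochs
    gamma=0.1,                    # multiply the LR multiplier by 0.1 at each milestone
    scale_warmup=True,            # new flag: SSR also shrinks the warmup period
)
print(scheduler.scale_warmup)     # True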
@@ -587,17 +595,31 @@ class ConstantWithWarmupScheduler(ComposerScheduler):
    Where :math:`\alpha` represents the learning rate multiplier to maintain while this scheduler is active, and
    :math:`t_{max}` represents the duration of this scheduler.

+    .. warning::
+        By default, initial warmup time is **not** scaled according to any provided scale schedule ratio.
+        To change this behavior, set ``scale_warmup=True``.
+
    Args:
        t_warmup (str | Time): Warmup time.
        alpha (float): Learning rate multiplier to maintain while this scheduler is active. Default = ``1.0``.
        t_max (str | Time): Duration of this scheduler. Default = ``"1dur"``.
+        scale_warmup (bool): SSR also scales the warmup period. Default = ``False``.
    """

-    def __init__(self, t_warmup: Union[str, Time], alpha: float = 1.0, t_max: Union[str, Time] = '1dur') -> None:
+    def __init__(self,
+                 t_warmup: Union[str, Time],
+                 alpha: float = 1.0,
+                 t_max: Union[str, Time] = '1dur',
+                 scale_warmup: bool = False) -> None:
        self.t_warmup = t_warmup
        self.alpha = alpha
        self.t_max = t_max
-        self.scheduler = LinearWithWarmupScheduler(t_warmup=t_warmup, alpha_i=alpha, alpha_f=alpha, t_max=t_max)
+        self.scale_warmup = scale_warmup
+        self.scheduler = LinearWithWarmupScheduler(t_warmup=t_warmup,
+                                                   alpha_i=alpha,
+                                                   alpha_f=alpha,
+                                                   t_max=t_max,
+                                                   scale_warmup=scale_warmup)

    def __call__(self, state: State, ssr: float = 1.0) -> float:
        return self.scheduler(state, ssr)
@@ -628,27 +650,31 @@ class LinearWithWarmupScheduler(ComposerScheduler):
    and :math:`\alpha_f` represents the learning rate multiplier to decay to, and :math:`t_{max}` represents the duration
    of this scheduler.

+
    .. warning::
-        Initial warmup time is **not** scaled according to any provided scale schedule ratio! However, the duration of
-        the scheduler is still scaled accordingly. To achieve this, after warmup, the scheduler's "pace" will be
-        slightly distorted from what would otherwise be expected.
+        By default, the initial warmup time is **not** scaled according to any provided scale schedule ratio. However, the duration of
+        the scheduler is still scaled accordingly. To achieve this, after warmup, the scheduler's "slope" will be
+        slightly distorted from what would otherwise be expected. To scale the entire schedule, set ``scale_warmup=True``.

    Args:
        t_warmup (str | Time): Warmup time.
        alpha_i (float): Initial learning rate multiplier. Default = ``1.0``.
        alpha_f (float): Final learning rate multiplier. Default = ``0.0``.
        t_max (str | Time): The duration of this scheduler. Default = ``"1dur"``.
+        scale_warmup (bool): SSR also scales the warmup period. Default = ``False``.
    """

    def __init__(self,
                 t_warmup: Union[str, Time],
                 alpha_i: float = 1.0,
                 alpha_f: float = 0.0,
-                 t_max: Union[str, Time] = '1dur'):
+                 t_max: Union[str, Time] = '1dur',
+                 scale_warmup: bool = False):
        self.t_warmup = t_warmup
        self.alpha_i = alpha_i
        self.alpha_f = alpha_f
        self.t_max = t_max
+        self.scale_warmup = scale_warmup
        self.warmup_scheduler = LinearScheduler(alpha_i=0.0, alpha_f=alpha_i, t_max=t_warmup)

    def __call__(self, state: State, ssr: float = 1.0):
@@ -661,6 +687,8 @@ def __call__(self, state: State, ssr: float = 1.0):
                same unit as the trainer's max_duration parameter."""))

        if state.timestamp < t_warmup:
+            if self.scale_warmup:
+                return self.warmup_scheduler(state, ssr)
            return self.warmup_scheduler(state)

        t_max = _convert_time(self.t_max, state, ssr=ssr)
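The "slope" distortion mentioned in the warning above can be seen with toy numbers. The function below is a simplified, hand-rolled model of a linear-with-warmup multiplier written only for illustration; the name, arguments, and arithmetic are assumptions, not the library's exact post-warmup logic:

# Toy illustration only: with ssr=0.5 and an unscaled 100-step warmup, the
# decay happens over 400 steps instead of 450, so the post-warmup slope is
# slightly steeper than in the fully scaled schedule.
def linear_with_warmup(t, t_warmup, t_max, ssr, scale_warmup,
                       alpha_i=1.0, alpha_f=0.0):
    warmup = t_warmup * ssr if scale_warmup else t_warmup
    if t < warmup:
        return alpha_i * t / warmup                 # linear warmup from 0 to alpha_i
    frac = (t - warmup) / (t_max * ssr - warmup)    # fraction of the decay window
    return alpha_i + (alpha_f - alpha_i) * min(frac, 1.0)

print(linear_with_warmup(300, t_warmup=100, t_max=1000, ssr=0.5, scale_warmup=False))  # 0.5
print(linear_with_warmup(300, t_warmup=100, t_max=1000, ssr=0.5, scale_warmup=True))   # ~0.444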
@@ -695,20 +723,25 @@ class CosineAnnealingWithWarmupScheduler(ComposerScheduler):
    :math:`\alpha_f` represents the learning rate multiplier to decay to.

    .. warning::
-        Initial warmup time is **not** scaled according to any provided scale schedule ratio! However, the duration of
-        the scheduler is still scaled accordingly. To achieve this, after warmup, the scheduler's "pace" will be
-        slightly distorted from what would otherwise be expected.
+        By default, initial warmup time is **not** scaled according to any provided scale schedule ratio.
+        To change this behavior, set ``scale_warmup=True``.

    Args:
        t_warmup (str | Time): Warmup time.
        t_max (str | Time): The duration of this scheduler. Default = ``"1dur"``.
        alpha_f (float): Learning rate multiplier to decay to. Default = ``0.0``.
+        scale_warmup (bool): SSR also scales the warmup period. Default = ``False``.
    """

-    def __init__(self, t_warmup: Union[str, Time], t_max: Union[str, Time] = '1dur', alpha_f: float = 0.0):
+    def __init__(self,
+                 t_warmup: Union[str, Time],
+                 t_max: Union[str, Time] = '1dur',
+                 alpha_f: float = 0.0,
+                 scale_warmup: bool = False):
        self.t_warmup = t_warmup
        self.t_max = t_max
        self.alpha_f = alpha_f
+        self.scale_warmup = scale_warmup
        self.warmup_scheduler = LinearScheduler(alpha_i=0.0, alpha_f=1.0, t_max=t_warmup)

    def __call__(self, state: State, ssr: float = 1.0):
@@ -721,6 +754,8 @@ def __call__(self, state: State, ssr: float = 1.0):
                same unit as the trainer's max_duration parameter."""))

        if state.timestamp < t_warmup:
+            if self.scale_warmup:
+                return self.warmup_scheduler(state, ssr)
            return self.warmup_scheduler(state)

        t_max = _convert_time(self.t_max, state, ssr=ssr)
@@ -754,26 +789,28 @@ class PolynomialWithWarmupScheduler(ComposerScheduler):
    :math:`\alpha_f` represents the learning rate multiplier to decay to.

    .. warning::
-        Initial warmup time is **not** scaled according to any provided scale schedule ratio! However, the duration of
-        the scheduler is still scaled accordingly. To achieve this, after warmup, the scheduler's "pace" will be
-        slightly distorted from what would otherwise be expected.
+        By default, initial warmup time is **not** scaled according to any provided scale schedule ratio.
+        To change this behavior, set ``scale_warmup=True``.

    Args:
        t_warmup (str | Time): Warmup time.
        power (float): The exponent to be used for the proportionality relationship. Default = ``2.0``.
        t_max (str | Time): The duration of this scheduler. Default = ``"1dur"``.
        alpha_f (float): Learning rate multiplier to decay to. Default = ``0.0``.
+        scale_warmup (bool): SSR also scales the warmup period. Default = ``False``.
    """

    def __init__(self,
                 t_warmup: Union[str, Time],
                 power: float = 2.0,
                 t_max: Union[str, Time] = '1dur',
-                 alpha_f: float = 0.0):
+                 alpha_f: float = 0.0,
+                 scale_warmup: bool = False):
        self.t_warmup = t_warmup
        self.power = power
        self.t_max = t_max
        self.alpha_f = alpha_f
+        self.scale_warmup = scale_warmup
        self.warmup_scheduler = LinearScheduler(alpha_i=0.0, alpha_f=1.0, t_max=t_warmup)

    def __call__(self, state: State, ssr: float = 1.0):
@@ -786,6 +823,8 @@ def __call__(self, state: State, ssr: float = 1.0):
                same unit as the trainer's max_duration parameter."""))

        if state.timestamp < t_warmup:
+            if self.scale_warmup:
+                return self.warmup_scheduler(state, ssr)
            return self.warmup_scheduler(state)

        t_max = _convert_time(self.t_max, state, ssr=ssr)
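All of the warmup schedulers touched above take the same boolean flag, and ``ConstantWithWarmupScheduler`` simply forwards it to its internal ``LinearWithWarmupScheduler``, so opting into SSR-scaled warmup is a one-argument change. A minimal sketch, assuming these classes are exported from ``composer.optim``; the ``'1ep'`` warmup time is illustrative:

# Assumed import path; construction only, no trainer State is needed here.
from composer.optim import (ConstantWithWarmupScheduler,
                            CosineAnnealingWithWarmupScheduler,
                            LinearWithWarmupScheduler,
                            PolynomialWithWarmupScheduler)

schedulers = [
    ConstantWithWarmupScheduler(t_warmup='1ep', scale_warmup=True),
    CosineAnnealingWithWarmupScheduler(t_warmup='1ep', scale_warmup=True),
    LinearWithWarmupScheduler(t_warmup='1ep', scale_warmup=True),
    PolynomialWithWarmupScheduler(t_warmup='1ep', scale_warmup=True),
]
for s in schedulers:
    print(type(s).__name__, s.scale_warmup)  # each prints True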