Skip to content

Commit ec5a4d2

Browse files
committed
add pytest for wsd scheduler
1 parent 78700d7 commit ec5a4d2

File tree

5 files changed

+166
-1
lines changed

5 files changed

+166
-1
lines changed

nemo/core/optim/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
CosineAnnealing,
1919
InverseSquareRootAnnealing,
2020
NoamAnnealing,
21+
WarmupHoldAnnealOneMinusSquareRoot,
22+
WarmupHoldAnnealLinear,
2123
PolynomialDecayAnnealing,
2224
PolynomialHoldDecayAnnealing,
2325
SquareAnnealing,

nemo/core/optim/lr_scheduler.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1020,6 +1020,8 @@ def compute_max_steps(
10201020
'CosineAnnealing': CosineAnnealing,
10211021
'NoamAnnealing': NoamAnnealing,
10221022
'NoamHoldAnnealing': NoamHoldAnnealing,
1023+
'WarmupHoldAnnealOneMinusSquareRoot': WarmupHoldAnnealOneMinusSquareRoot,
1024+
'WarmupHoldAnnealLinear': WarmupHoldAnnealLinear,
10231025
'WarmupAnnealing': WarmupAnnealing,
10241026
'InverseSquareRootAnnealing': InverseSquareRootAnnealing,
10251027
'T5InverseSquareRootAnnealing': T5InverseSquareRootAnnealing,

nemo/lightning/pytorch/optim/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
WarmupAnnealingScheduler,
2727
WarmupHoldPolicyScheduler,
2828
WarmupPolicyScheduler,
29+
WarmupHoldAnnealScheduler,
2930
)
3031
from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule
3132
from nemo.lightning.pytorch.optim.pytorch import PytorchOptimizerModule
@@ -47,4 +48,5 @@
4748
"PolynomialHoldDecayAnnealingScheduler",
4849
"CosineAnnealingScheduler",
4950
"PytorchOptimizerModule",
51+
"WarmupHoldAnnealScheduler",
5052
]

nemo/lightning/pytorch/optim/lr_scheduler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,7 @@ def scheduler(self, model, optimizer):
486486
"monitor": self.monitor,
487487
}
488488

489-
class WarmupHoldAnneal(LRSchedulerModule):
489+
class WarmupHoldAnnealScheduler(LRSchedulerModule):
490490
def __init__(
491491
self,
492492
warmup_ratio: Optional[float] = None,

tests/core/test_optimizers_schedulers.py

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -883,6 +883,165 @@ def test_InverseSquareRootAnnealing(self):
883883

884884
assert final_lr == self.MIN_LR
885885

886+
class TestWarmupHoldAnnealSchedulers:
887+
INITIAL_LR = 0.1
888+
MIN_LR = 0.01
889+
MAX_STEPS = 100
890+
891+
@pytest.mark.unit
892+
def test_WarmupHoldAnnealOneMinusSquareRoot(self):
893+
model = TempModel()
894+
opt_cls = optim.get_optimizer('novograd')
895+
opt = opt_cls(model.parameters(), lr=self.INITIAL_LR)
896+
897+
# Test case 1: No warmup, no hold
898+
policy = optim.lr_scheduler.WarmupHoldAnnealOneMinusSquareRoot(
899+
opt,
900+
warmup_ratio=None,
901+
hold_ratio=None,
902+
max_steps=self.MAX_STEPS,
903+
min_lr=self.MIN_LR
904+
)
905+
initial_lr = policy.get_last_lr()[0]
906+
assert initial_lr == self.INITIAL_LR
907+
908+
# Simulate training steps
909+
lrs = []
910+
for i in range(self.MAX_STEPS):
911+
current_lr = policy.get_last_lr()[0]
912+
lrs.append(current_lr)
913+
assert current_lr <= self.INITIAL_LR
914+
opt.step()
915+
policy.step()
916+
917+
# Check final LR
918+
policy.step()
919+
final_lr = policy.get_last_lr()[0]
920+
assert final_lr == self.MIN_LR
921+
922+
# Test case 2: With warmup and hold
923+
warmup_ratio = 0.1 # 10% warmup
924+
hold_ratio = 0.2 # 20% hold
925+
warmup_steps = int(warmup_ratio * self.MAX_STEPS)
926+
hold_steps = int(hold_ratio * self.MAX_STEPS)
927+
928+
policy = optim.lr_scheduler.WarmupHoldAnnealOneMinusSquareRoot(
929+
opt,
930+
warmup_ratio=warmup_ratio,
931+
hold_ratio=hold_ratio,
932+
max_steps=self.MAX_STEPS,
933+
min_lr=self.MIN_LR
934+
)
935+
936+
initial_lr = policy.get_last_lr()[0]
937+
assert initial_lr < self.INITIAL_LR # Should start at a lower LR
938+
939+
# Simulate training steps
940+
lrs = []
941+
for i in range(self.MAX_STEPS):
942+
current_lr = policy.get_last_lr()[0]
943+
lrs.append(current_lr)
944+
945+
# During warmup, LR should increase
946+
if i < warmup_steps:
947+
if i > 0:
948+
assert current_lr >= lrs[i-1]
949+
assert current_lr <= self.INITIAL_LR
950+
951+
# During hold, LR should remain constant
952+
elif i < warmup_steps + hold_steps:
953+
assert abs(current_lr - self.INITIAL_LR) < 1e-6
954+
955+
# During annealing, LR should decrease
956+
else:
957+
if i > warmup_steps + hold_steps:
958+
assert current_lr <= lrs[i-1]
959+
960+
opt.step()
961+
policy.step()
962+
963+
# Check final LR
964+
policy.step()
965+
final_lr = policy.get_last_lr()[0]
966+
assert final_lr == self.MIN_LR
967+
968+
@pytest.mark.unit
969+
def test_WarmupHoldAnnealLinear(self):
970+
model = TempModel()
971+
opt_cls = optim.get_optimizer('novograd')
972+
opt = opt_cls(model.parameters(), lr=self.INITIAL_LR)
973+
974+
# Test case 1: No warmup, no hold
975+
policy = optim.lr_scheduler.WarmupHoldAnnealLinear(
976+
opt,
977+
warmup_ratio=None,
978+
hold_ratio=None,
979+
max_steps=self.MAX_STEPS,
980+
min_lr=self.MIN_LR
981+
)
982+
initial_lr = policy.get_last_lr()[0]
983+
assert initial_lr == self.INITIAL_LR
984+
985+
# Simulate training steps
986+
lrs = []
987+
for i in range(self.MAX_STEPS):
988+
current_lr = policy.get_last_lr()[0]
989+
lrs.append(current_lr)
990+
assert current_lr <= self.INITIAL_LR
991+
opt.step()
992+
policy.step()
993+
994+
# Check final LR
995+
policy.step()
996+
final_lr = policy.get_last_lr()[0]
997+
assert final_lr == self.MIN_LR
998+
999+
# Test case 2: With warmup and hold
1000+
warmup_ratio = 0.1 # 10% warmup
1001+
hold_ratio = 0.2 # 20% hold
1002+
warmup_steps = int(warmup_ratio * self.MAX_STEPS)
1003+
hold_steps = int(hold_ratio * self.MAX_STEPS)
1004+
1005+
policy = optim.lr_scheduler.WarmupHoldAnnealLinear(
1006+
opt,
1007+
warmup_ratio=warmup_ratio,
1008+
hold_ratio=hold_ratio,
1009+
max_steps=self.MAX_STEPS,
1010+
min_lr=self.MIN_LR
1011+
)
1012+
1013+
initial_lr = policy.get_last_lr()[0]
1014+
assert initial_lr < self.INITIAL_LR # Should start at a lower LR
1015+
1016+
# Simulate training steps
1017+
lrs = []
1018+
for i in range(self.MAX_STEPS):
1019+
current_lr = policy.get_last_lr()[0]
1020+
lrs.append(current_lr)
1021+
1022+
# During warmup, LR should increase
1023+
if i < warmup_steps:
1024+
if i > 0:
1025+
assert current_lr >= lrs[i-1]
1026+
assert current_lr <= self.INITIAL_LR
1027+
1028+
# During hold, LR should remain constant
1029+
elif i < warmup_steps + hold_steps:
1030+
assert abs(current_lr - self.INITIAL_LR) < 1e-6
1031+
1032+
# During annealing, LR should decrease
1033+
else:
1034+
if i > warmup_steps + hold_steps:
1035+
assert current_lr <= lrs[i-1]
1036+
1037+
opt.step()
1038+
policy.step()
1039+
1040+
# Check final LR
1041+
policy.step()
1042+
final_lr = policy.get_last_lr()[0]
1043+
assert final_lr == self.MIN_LR
1044+
8861045
@pytest.mark.unit
8871046
def test_CosineAnnealing_with_noop_steps(self):
8881047
model = TempModel()

0 commit comments

Comments (0)