
Commit 7428f5f

If desired, training can be stopped on a specific step without impacting the LR curve. (#739)

See the new option `--early-stop-on-step`, which may be used in JET tests to stop training on a specific step without the change to the LR curve that lowering `--max-steps` would cause.

---------

Signed-off-by: John St John <[email protected]>
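As a sketch of the intended usage (hedged: `parse_args` may require additional flags such as data and model options, which are omitted here for brevity), the LR schedule stays shaped by `--max-steps` while `--early-stop-on-step` only truncates the run:

from bionemo.evo2.run.train import parse_args, train

# Hypothetical minimal invocation: LR warmup/decay is computed against
# --max-steps, while --early-stop-on-step merely halts the run early.
args = parse_args(
    [
        "--mock-data",
        "--max-steps", "500000",         # full schedule length; shapes the LR curve
        "--early-stop-on-step", "100",   # stop after 100 optimizer steps; LR curve unchanged
    ]
)
train(args)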
1 parent 32e402a commit 7428f5f

File tree

3 files changed: +96 −5 lines changed

sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py

Lines changed: 20 additions & 2 deletions
@@ -48,6 +48,7 @@
 from nemo.utils.exp_manager import TimingCallback
 
 from bionemo.llm.utils.datamodule_utils import infer_global_batch_size
+from bionemo.testing.testing_callbacks import SignalAfterGivenStepCallback
 
 
 torch._dynamo.config.suppress_errors = True
@@ -119,7 +120,18 @@ def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
     parser.add_argument(
         "--grad-acc-batches", type=int, default=1, help="Number of batches to accumulate gradients over."
     )
-    parser.add_argument("--max-steps", type=int, help="Number of training optimizer update steps.")
+    parser.add_argument(
+        "--max-steps",
+        type=int,
+        help="Number of training optimizer update steps. This controls the total number of steps as well as the "
+        "shape of the learning rate curve.",
+        default=500000,
+    )
+    parser.add_argument(
+        "--early-stop-on-step",
+        type=int,
+        help="Stop training on this step, if set. This may be useful for testing or debugging purposes.",
+    )
     parser.add_argument(
         "--val-check-interval", type=int, help="Number of steps between validation measurements and model checkpoints."
     )
@@ -468,7 +480,13 @@ def train(args: argparse.Namespace):
         save_context_on_train_end=True,
     )
     callbacks.append(checkpoint_callback)
-
+    if args.early_stop_on_step:
+        # Ask the trainer to stop by setting should_stop to True rather than emitting a kill signal.
+        callbacks.append(
+            SignalAfterGivenStepCallback(
+                stop_step=args.early_stop_on_step, stop_before_step=True, use_trainer_should_stop=True
+            )
+        )
     if args.enable_preemption:
         callbacks.append(nl_callbacks.PreemptionCallback())
     if args.debug_ddp_parity_freq > 0:
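For readers unfamiliar with NeMo's Megatron callback hooks, the wiring above behaves like the following Lightning-style sketch; the class and hook below are illustrative stand-ins, not part of this commit:

from lightning.pytorch.callbacks import Callback


class EarlyStopOnStep(Callback):
    """Illustrative stand-in for SignalAfterGivenStepCallback(use_trainer_should_stop=True)."""

    def __init__(self, stop_step: int):
        # Mirrors stop_before_step=True: global_step is zero-indexed, so stopping
        # once it reaches stop_step - 1 runs exactly stop_step optimizer steps.
        self.stop_step = stop_step - 1

    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
        if trainer.global_step >= self.stop_step:
            # Cooperative stop: no kill signal, so checkpointing and teardown run
            # normally and the LR schedule is never recomputed.
            trainer.should_stop = True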

sub-packages/bionemo-evo2/tests/bionemo/evo2/run/test_train.py

Lines changed: 56 additions & 0 deletions
@@ -16,9 +16,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import io
 import os
+import re
+import shlex
 import subprocess
 import sys
+from contextlib import redirect_stderr, redirect_stdout
 
 import pytest
 import torch
@@ -73,6 +77,58 @@ def test_train_evo2_runs(tmp_path, num_steps=5):
     assert result.returncode == 0, "train_evo2 command failed."
 
 
+@pytest.mark.timeout(256)  # Optional: fail if the test takes too long.
+@pytest.mark.slow
+def test_train_evo2_stops(tmp_path, num_steps=500000, early_stop_steps=3):
+    """
+    This test runs `train_evo2` with mock data in a temporary directory.
+    It uses the temporary directory provided by pytest as the working directory.
+    The arguments are parsed and `train` is invoked in-process; we assert that training stops at the early-stop step.
+    """
+    open_port = find_free_network_port()
+    # a local copy of the environment
+    env = dict(**os.environ)
+    env["MASTER_PORT"] = str(open_port)
+
+    # Build the command string, then parse it into arguments for a direct
+    # in-process call to `train` (the `train_evo2` binary itself is not executed).
+    command = (
+        f"train_evo2 --mock-data --experiment-dir {tmp_path}/test_train "
+        "--model-size 1b_nv --num-layers 4 --hybrid-override-pattern SDH* "
+        "--no-activation-checkpointing --add-bias-output "
+        f"--max-steps {num_steps} --early-stop-on-step {early_stop_steps} --warmup-steps 1 --no-wandb "
+        "--seq-length 128 --hidden-dropout 0.1 --attention-dropout 0.1 "
+    )
+    command_parts_no_program = shlex.split(command)[1:]
+    args = parse_args(args=command_parts_no_program)
+    with distributed_model_parallel_state():
+        # Capture stdout/stderr during train function execution
+        stdout_buffer = io.StringIO()
+        stderr_buffer = io.StringIO()
+        with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer):
+            train(args=args)
+        # Get the captured output
+        train_stdout = stdout_buffer.getvalue()
+        train_stderr = stderr_buffer.getvalue()
+        # Print the captured output for debugging
+        print("TRAIN FUNCTION STDOUT:")
+        print(train_stdout)
+        print("TRAIN FUNCTION STDERR:")
+        print(train_stderr)
+
+    # Assert that training logged a loss and stopped at the expected step.
+    assert "reduced_train_loss:" in train_stdout
+    pattern = r"\| global_step: (\d+) \|"
+
+    def extract_global_steps(log_string):
+        matches = re.findall(pattern, log_string)
+        return [int(step) for step in matches]
+
+    global_step_ints = extract_global_steps(train_stdout)
+    assert global_step_ints[-1] == early_stop_steps - 1
+    assert len(global_step_ints) == early_stop_steps
+
+
 @pytest.mark.slow
 @pytest.mark.parametrize("model_size", ["7b_nv", "7b_arc_longcontext"])
 def test_train_single_gpu(tmp_path, model_size: str):
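The step assertions above rely on zero-indexed `global_step` log lines. A standalone sanity check of that regex (the log-line format is assumed from the test itself, not taken from NeMo's logger):

import re

pattern = r"\| global_step: (\d+) \|"
# Fabricated log lines in the format the test expects.
fake_log = "\n".join(f"... | global_step: {i} | reduced_train_loss: 2.31" for i in range(3))

steps = [int(s) for s in re.findall(pattern, fake_log)]
assert steps == [0, 1, 2]   # steps are zero-indexed
assert steps[-1] == 3 - 1   # last step is early_stop_steps - 1
assert len(steps) == 3      # exactly early_stop_steps steps were logged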

sub-packages/bionemo-testing/src/bionemo/testing/testing_callbacks.py

Lines changed: 20 additions & 3 deletions
@@ -49,15 +49,32 @@ class SignalAfterGivenStepCallback(Callback, CallbackMethods):
     Use this callback for pytest based Stop and go tests.
     """
 
-    def __init__(self, stop_step: int, signal_: signal.Signals = signal.SIGUSR2):
+    def __init__(
+        self,
+        stop_step: int,
+        signal_: signal.Signals = signal.SIGUSR2,
+        use_trainer_should_stop: bool = False,
+        stop_before_step: bool = False,
+    ):
         """Initializes the callback with the given stop_step."""
-        self.stop_step = stop_step
+        # Note that the stored stop step will be one less than the requested step if stop_before_step is True.
+        # This is because steps are zero-indexed, so running through step i normally yields i + 1 steps.
+        if stop_before_step:
+            self.stop_step = stop_step - 1
+        else:
+            self.stop_step = stop_step
         self.signal = signal_
+        # If True, ask the trainer to stop by setting should_stop to True rather than emitting a kill signal.
+        self.use_trainer_should_stop = use_trainer_should_stop
 
     def on_megatron_step_start(self, step: MegatronStep) -> MegatronStep:
         """Stop training if the global step is greater than or equal to the stop_step."""
         if step.trainer.global_step >= self.stop_step:
-            os.kill(os.getpid(), self.signal)
+            if self.use_trainer_should_stop:
+                # Ask the trainer to stop by setting should_stop to True rather than emitting a kill signal.
+                step.trainer.should_stop = True
+            else:
+                os.kill(os.getpid(), self.signal)
         return step
 
 