Description
BioNeMo Framework Version
Bug Description
The unit test that exercises training resumption for the Evo2 model, sub-packages/bionemo-evo2/tests/bionemo/evo2/run/test_train.py::test_train_evo2_stop_at_max_steps_and_continue,
hangs and exceeds the time limit when run on an L40 GPU in internal CI.
It does not hang when run on an A100.
Steps to Reproduce
- Remove the @pytest.mark.skip mark from the test.
- Rebuild the Docker image.
- Run the testing pipeline on internal CI, i.e. on an L4 or L40 runner (see the example invocation below).
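
A minimal reproduction sketch (the image tag and exact pytest invocation are assumptions; only the test path comes from this report, and the repository's pytest configuration may require an additional option to enable slow tests):

docker build -t bionemo-evo2-repro .   # hypothetical tag; rebuild after removing the skip mark
pytest -v sub-packages/bionemo-evo2/tests/bionemo/evo2/run/test_train.py::test_train_evo2_stop_at_max_steps_and_continue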
Error Messages and Logs
22:26:28 ________________ test_train_evo2_stop_at_max_steps_and_continue ________________
22:26:28
22:26:28 tmp_path = PosixPath('/tmp/pytest-of-root/pytest-2/test_train_evo2_stop_at_max_st0')
22:26:28
22:26:28 @pytest.mark.timeout(256) # Optional: fail if the test takes too long.
22:26:28 @pytest.mark.slow
22:26:28 def test_train_evo2_stop_at_max_steps_and_continue(tmp_path):
22:26:28 max_steps_first_run = 4
22:26:28 max_steps_second_run = 6
22:26:28 val_check_interval = 2
22:26:28 # Expected location of logs and checkpoints
22:26:28 log_dir = tmp_path / "evo2"
22:26:28 checkpoints_dir = log_dir / "checkpoints"
22:26:28
22:26:28 command_first_run = small_training_cmd(tmp_path, max_steps_first_run, val_check_interval)
22:26:28
22:26:28 # The first training command to finish at max_steps_first_run
22:26:28 stdout_first_run = run_command_in_subprocess(command=command_first_run, path=str(tmp_path))
22:26:28
22:26:28 assert f"Training epoch 0, iteration 0/{max_steps_first_run - 1}" in stdout_first_run
22:26:28 # Extract and validate global steps
22:26:28 global_steps_first_run = extract_global_steps_from_log(stdout_first_run)
22:26:28
22:26:28 assert global_steps_first_run[0] == 0
22:26:28 assert global_steps_first_run[-1] == max_steps_first_run - 1
22:26:28 assert len(global_steps_first_run) == max_steps_first_run
22:26:28
22:26:28 expected_checkpoint_first_run_suffix = f"{max_steps_first_run}.0-last"
22:26:28 # Check if checkpoints dir exists
22:26:28 assert checkpoints_dir.exists(), "Checkpoints folder does not exist."
22:26:28 # Check if any ckpt subfolder ends with the expected suffix
22:26:28 matching_subfolders = [
22:26:28 p for p in checkpoints_dir.iterdir() if p.is_dir() and (expected_checkpoint_first_run_suffix in p.name)
22:26:28 ]
22:26:28 assert matching_subfolders, (
22:26:28 f"No checkpoint subfolder ending with '{expected_checkpoint_first_run_suffix}' found in {checkpoints_dir}."
22:26:28 )
22:26:28
22:26:28 # The second training command to continue from max_steps_first_run and finish at max_steps_second_run
22:26:28 command_second_run = small_training_cmd(tmp_path, max_steps_second_run, val_check_interval)
22:26:28 > stdout_second_run = run_command_in_subprocess(command=command_second_run, path=str(tmp_path))
22:26:28
22:26:28 sub-packages/bionemo-evo2/tests/bionemo/evo2/run/test_train.py:204:
22:26:28 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
22:26:28 sub-packages/bionemo-evo2/tests/bionemo/evo2/run/test_train.py:43: in run_command_in_subprocess
22:26:28 result = subprocess.run(
22:26:28 /usr/lib/python3.12/subprocess.py:550: in run
22:26:28 stdout, stderr = process.communicate(input, timeout=timeout)
22:26:28 /usr/lib/python3.12/subprocess.py:1209: in communicate
22:26:28 stdout, stderr = self._communicate(input, endtime, timeout)
22:26:28 /usr/lib/python3.12/subprocess.py:2115: in _communicate
22:26:28 ready = selector.select(timeout)
22:26:28 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
22:26:28
22:26:28 self = <selectors.PollSelector object at 0x7f8793fc8b00>, timeout = None
22:26:28
22:26:28 def select(self, timeout=None):
22:26:28 # This is shared between poll() and epoll().
22:26:28 # epoll() has a different signature and handling of timeout parameter.
22:26:28 if timeout is None:
22:26:28 timeout = None
22:26:28 elif timeout <= 0:
22:26:28 timeout = 0
22:26:28 else:
22:26:28 # poll() has a resolution of 1 millisecond, round away from
22:26:28 # zero to wait *at least* timeout seconds.
22:26:28 timeout = math.ceil(timeout * 1e3)
22:26:28 ready = []
22:26:28 try:
22:26:28 > fd_event_list = self._selector.poll(timeout)
22:26:28 E Failed: Timeout >256.0s
22:26:28
22:26:28 /usr/lib/python3.12/selectors.py:415: Failed
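
Note that the hang is only interrupted by the pytest-timeout plugin: the traceback shows subprocess.communicate waiting with timeout = None, so the run_command_in_subprocess helper itself never gives up on the second training run. A minimal sketch, assuming a helper shaped like the one at test_train.py:43 (the shell invocation and the default timeout value are assumptions), of bounding the subprocess call so a hung run raises promptly with captured output instead of stalling the CI job:

import subprocess


def run_command_in_subprocess(command: str, path: str, timeout_s: int = 240) -> str:
    """Hypothetical variant of the test helper with an explicit subprocess timeout."""
    result = subprocess.run(
        command,
        shell=True,           # assumption: the real helper may split args instead
        cwd=path,
        capture_output=True,
        text=True,
        timeout=timeout_s,    # raises subprocess.TimeoutExpired instead of hanging forever
    )
    result.check_returncode()
    return result.stdout

With such a bound, the L40 hang would surface as a TimeoutExpired from the second run rather than the opaque selector-level timeout above.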
Docker Image
No response
System Information
21:29:18 +-----------------------------------------------------------------------------------------+
21:29:18 | NVIDIA-SMI 560.35.03 Driver Version: 560.35.03 CUDA Version: 12.8 |
21:29:18 |-----------------------------------------+------------------------+----------------------+
21:29:18 | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
21:29:18 | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
21:29:18 | | | MIG M. |
21:29:18 |=========================================+========================+======================|
21:29:18 | 0 NVIDIA L40 On | 00000000:C1:00.0 Off | 0 |
21:29:18 | N/A 31C P8 33W / 300W | 1MiB / 46068MiB | 0% Default |
21:29:18 | | | N/A |
21:29:18 +-----------------------------------------+------------------------+----------------------+
21:29:18
21:29:18 +-----------------------------------------------------------------------------------------+
21:29:18 | Processes: |
21:29:18 | GPU GI CI PID Type Process name GPU Memory |
21:29:18 | ID ID Usage |
21:29:18 |=========================================================================================|
21:29:18 | No running processes found |
21:29:18 +-----------------------------------------------------------------------------------------+