Skip to content

Commit d106dca

Browse files
committed
Replace shell commands with subprocess.run
Signed-off-by: Bruno Alvisio <[email protected]>
1 parent a6f5a7f commit d106dca

File tree

6 files changed

+185
-56
lines changed

6 files changed

+185
-56
lines changed

ci/scripts/pytest_runner.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ SKIP_SLOW=false
5757
ONLY_SLOW=false
5858
ALLOW_NO_TESTS=false
5959
# TODO(@cspades): Ignore this Evo2 notebook test, which has a tendency to leave a 32GB orphaned process in GPU.
60-
declare -a IGNORE_FILES=("sub-packages/bionemo-evo2/examples/fine-tuning-tutorial.ipynb")
60+
declare -a IGNORE_FILES=()
6161
error=false
6262

6363
# Parse command line arguments

ci/scripts/run_pytest_notebooks.sh

100644100755
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,4 @@
1919
# Enable strict mode with better error handling
2020
set -euox pipefail
2121

22-
pytest -v --nbval-lax -p no:python docs/ sub-packages/
22+
pytest -v --nbval-lax -x -p no:python docs/ sub-packages/

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,6 @@ addopts = [
119119
"--durations-min=30.0",
120120
"--durations=0",
121121
"--ignore=3rdparty",
122-
"--ignore-glob=sub-packages/bionemo-evo2/examples/fine-tuning-tutorial.ipynb",
123122
"--ignore-glob=sub-packages/bionemo-moco/examples/discrete_data_interpolant_tutorial.ipynb"
124123
]
125124
markers = ["slow: marks tests as slow (deselect with '-m \"not slow\"')"]
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: LicenseRef-Apache2
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
17+
import logging
import subprocess
from typing import Any, Dict, List


logger = logging.getLogger(__name__)


def run_subprocess_safely(command: List[str], timeout: int = 2000) -> Dict[str, Any]:
    """Run a subprocess and report the outcome as a dict instead of raising.

    The command is executed without a shell (``shell=False``), so ``command``
    must be a list where every argv token — flag and value alike — is its own
    element. All failure modes (timeout, non-zero exit, missing executable,
    unexpected errors) are caught, logged, and reported through the returned
    dict rather than propagated to the caller.

    Args:
        command: The command and its arguments, one argv token per element.
        timeout: Seconds to wait before the process is killed.

    Returns:
        On success, a dict with ``stdout``, ``stderr`` and ``returncode`` keys.
        On failure, the dict additionally carries an ``error`` key naming the
        failure kind (``"timeout"``, ``"non-zero exit"``, ``"not found"``, or
        ``"other"``); ``returncode`` is ``None`` unless the process exited.
    """
    try:
        result = subprocess.run(command, capture_output=True, timeout=timeout, check=True, text=True)
        return {"stdout": result.stdout, "stderr": result.stderr, "returncode": result.returncode}
    except subprocess.TimeoutExpired as e:
        # NOTE: e.stdout / e.stderr can be None (or bytes) if the child was
        # killed before its streams were fully captured.
        logger.error("Command timed out. Command: %s\nstdout:\n%s\nstderr:\n%s", command, e.stdout, e.stderr)
        return {"error": "timeout", "stdout": e.stdout, "stderr": e.stderr, "returncode": None}
    except subprocess.CalledProcessError as e:
        logger.error(
            "Command failed. Command: %s\nreturncode: %s\nstdout:\n%s\nstderr:\n%s",
            command,
            e.returncode,
            e.stdout,
            e.stderr,
        )
        return {"error": "non-zero exit", "stdout": e.stdout, "stderr": e.stderr, "returncode": e.returncode}
    except FileNotFoundError as e:
        logger.error("Command not found. Command: %s\nstderr:\n%s", command, e)
        return {"error": "not found", "stdout": "", "stderr": str(e), "returncode": None}
    except Exception as e:
        # Catch-all so callers always get a dict back; log with traceback so
        # the unexpected failure is never silent.
        logger.exception("Unexpected error running command: %s", command)
        return {"error": "other", "message": str(e), "stdout": "", "stderr": "", "returncode": None}

sub-packages/bionemo-evo2/examples/fine-tuning-tutorial.ipynb

Lines changed: 64 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,10 @@
9393
"metadata": {},
9494
"outputs": [],
9595
"source": [
96-
"%%capture\n",
9796
"import os\n",
9897
"\n",
98+
"from bionemo.core.utils.subprocess_utils import run_subprocess_safely\n",
99+
"\n",
99100
"\n",
100101
"concat_path = \"chr20_21_22.fa\"\n",
101102
"if not os.path.exists(concat_path):\n",
@@ -158,7 +159,6 @@
158159
"metadata": {},
159160
"outputs": [],
160161
"source": [
161-
"%%capture\n",
162162
"!preprocess_evo2 --config preprocess_config.yaml"
163163
]
164164
},
@@ -207,7 +207,6 @@
207207
"metadata": {},
208208
"outputs": [],
209209
"source": [
210-
"%%capture\n",
211210
"if not os.path.exists(\"nemo2_evo2_1b_8k\"):\n",
212211
" !evo2_convert_to_nemo2 \\\n",
213212
" --model-path hf://arcinstitute/savanna_evo2_1b_base \\\n",
@@ -262,46 +261,77 @@
262261
"metadata": {},
263262
"outputs": [],
264263
"source": [
265-
"%%capture\n",
266264
"MAX_STEPS: int = 10 if FAST_CI_MODE else 100\n",
267265
"val_check_interval = min(int(MAX_STEPS // 2), 50)\n",
268266
"warmup_steps = min(MAX_STEPS, 100)\n",
269267
"# For evo2 training and fine-tuning follow the same set of steps, so we use the same train_evo2 command.\n",
270268
"# the big difference is the --ckpt-dir argument which points to a pre-existing checkpoint from some other training run.\n",
271269
"\n",
272270
"if FAST_CI_MODE:\n",
273-
" model_subset_option = (\n",
274-
" \"--num-layers 4 --hybrid-override-pattern SDH* --activation-checkpoint-recompute-num-layers 2\"\n",
275-
" )\n",
271+
" model_subset_option = [\n",
272+
" \"--num-layers\",\n",
273+
" \"4\",\n",
274+
" \"--hybrid-override-pattern\",\n",
275+
" \"SDH*\",\n",
276+
" \"--activation-checkpoint-recompute-num-layers\",\n",
277+
" \"2\",\n",
278+
" ]\n",
276279
"else:\n",
277280
" # By default do 5 layers of activation checkpointing\n",
278-
" model_subset_option = \"--activation-checkpoint-recompute-num-layers 5\"\n",
279-
"train_cmd = f\"\"\"train_evo2 \\\n",
280-
" -d training_data_config.yaml \\\n",
281-
" --dataset-dir ./preprocessed_data \\\n",
282-
" --result-dir pretraining_demo \\\n",
283-
" --experiment-name evo2 \\\n",
284-
" --model-size 1b \\\n",
285-
" --devices 1 \\\n",
286-
" --num-nodes 1 \\\n",
287-
" --seq-length 8192 \\\n",
288-
" --micro-batch-size 2 \\\n",
289-
" --lr 0.000015 \\\n",
290-
" --min-lr 0.0000149 \\\n",
291-
" --warmup-steps {warmup_steps} \\\n",
292-
" --grad-acc-batches 4 \\\n",
293-
" --max-steps {MAX_STEPS} \\\n",
294-
" --ckpt-dir nemo2_evo2_1b_8k \\\n",
295-
" --clip-grad 250 \\\n",
296-
" --wd 0.001 \\\n",
297-
" --attention-dropout 0.01 \\\n",
298-
" --hidden-dropout 0.01 \\\n",
299-
" --val-check-interval {val_check_interval} \\\n",
300-
" {model_subset_option} \\\n",
301-
" --create-tensorboard-logger \\\n",
302-
" --ckpt-async-save\"\"\"\n",
303-
"\n",
304-
"!{train_cmd}"
281+
" model_subset_option = [\"--activation-checkpoint-recompute-num-layers\", \"5\"]\n",
282+
"\n",
283+
"train_cmd = [\n",
284+
" \"train_evo2\",\n",
285+
" \"-d\",\n",
286+
" \"training_data_config.yaml\",\n",
287+
" \"--dataset-dir\",\n",
288+
" \"./preprocessed_data\",\n",
289+
" \"--result-dir\",\n",
290+
" \"pretraining_demo\",\n",
291+
" \"--experiment-name\",\n",
292+
" \"evo2\",\n",
293+
" \"--model-size\",\n",
294+
" \"1b\",\n",
295+
" \"--devices\",\n",
296+
" \"1\",\n",
297+
" \"--num-nodes\",\n",
298+
" \"1\",\n",
299+
" \"--seq-length\",\n",
300+
" \"8192\",\n",
301+
" \"--micro-batch-size\",\n",
302+
" \"2\",\n",
303+
" \"--lr\",\n",
304+
" \"0.000015\",\n",
305+
" \"--min-lr\",\n",
306+
" \"0.0000149\",\n",
307+
" \"--warmup-steps\",\n",
308+
" str(warmup_steps),\n",
309+
" \"--grad-acc-batches\",\n",
310+
" \"4\",\n",
311+
" \"--max-steps\",\n",
312+
" str(MAX_STEPS),\n",
313+
" \"--ckpt-dir\",\n",
314+
" \"nemo2_evo2_1b_8k\",\n",
315+
" \"--clip-grad\",\n",
316+
" \"250\",\n",
317+
" \"--wd\",\n",
318+
" \"0.001\",\n",
319+
" \"--attention-dropout\",\n",
320+
" \"0.01\",\n",
321+
" \"--hidden-dropout\",\n",
322+
" \"0.01\",\n",
323+
" \"--val-check-interval\",\n",
324+
" str(val_check_interval),\n",
325+
" \"--create-tensorboard-logger\",\n",
326+
" \"--ckpt-async-save\",\n",
327+
"]\n",
328+
"\n",
329+
"train_cmd.extend(model_subset_option)\n",
330+
"\n",
331+
"print(f\"Running command: {train_cmd}\")\n",
332+
"\n",
333+
"result = run_subprocess_safely(train_cmd)\n",
334+
"assert result[\"returncode\"] == 0, result"
305335
]
306336
},
307337
{

sub-packages/bionemo-evo2/examples/zeroshot_brca1.ipynb

Lines changed: 65 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,9 @@
5050
"import seaborn as sns\n",
5151
"import torch\n",
5252
"from Bio import SeqIO\n",
53-
"from sklearn.metrics import auc, roc_auc_score, roc_curve"
53+
"from sklearn.metrics import auc, roc_auc_score, roc_curve\n",
54+
"\n",
55+
"from bionemo.core.utils.subprocess_utils import run_subprocess_safely"
5456
]
5557
},
5658
{
@@ -662,24 +664,11 @@
662664
"# The Evo2 config has 'use_fp8_input_projections: True' by default\n",
663665
"\n",
664666
"if FAST_CI_MODE:\n",
665-
" model_subset_option = \"--num-layers 4 --hybrid-override-pattern SDH*\"\n",
667+
"    model_subset_options = [\"--num-layers\", \"4\", \"--hybrid-override-pattern\", \"SDH*\"]\n",
666668
"else:\n",
667-
" model_subset_option = \"\"\n",
668-
"\n",
669-
"fp8_option = \"--fp8\" if fp8_supported else \"\"\n",
670-
"\n",
671-
"# Update predict commands to run on the full dataset\n",
672-
"predict_ref_command = (\n",
673-
" f\"predict_evo2 --fasta {ref_fasta_path} --ckpt-dir {checkpoint_path} \"\n",
674-
" f\"--output-dir {predict_ref_dir} --model-size {MODEL_SIZE} --tensor-parallel-size 1 {model_subset_option} \"\n",
675-
" f\"--pipeline-model-parallel-size 1 --context-parallel-size 1 --output-log-prob-seqs {fp8_option}\"\n",
676-
")\n",
669+
" model_subset_options = []\n",
677670
"\n",
678-
"predict_var_command = (\n",
679-
" f\"predict_evo2 --fasta {var_fasta_path} --ckpt-dir {checkpoint_path} \"\n",
680-
" f\"--output-dir {predict_var_dir} --model-size {MODEL_SIZE} --tensor-parallel-size 1 {model_subset_option} \"\n",
681-
" f\"--pipeline-model-parallel-size 1 --context-parallel-size 1 --output-log-prob-seqs {fp8_option}\"\n",
682-
")"
671+
"fp8_option = \"--fp8\" if fp8_supported else \"\""
683672
]
684673
},
685674
{
@@ -696,8 +685,38 @@
696685
"outputs": [],
697686
"source": [
698687
"%%capture\n",
688+
"# Update predict commands to run on the full dataset\n",
690+
"predict_ref_command = [\n",
691+
" \"predict_evo2\",\n",
692+
" \"--fasta\",\n",
693+
" ref_fasta_path,\n",
694+
" \"--ckpt-dir\",\n",
695+
" checkpoint_path,\n",
696+
" \"--output-dir\",\n",
697+
" predict_ref_dir,\n",
698+
" \"--model-size\",\n",
699+
" MODEL_SIZE,\n",
700+
" \"--tensor-parallel-size\",\n",
701+
" \"1\",\n",
702+
" \"--pipeline-model-parallel-size\",\n",
703+
" \"1\",\n",
704+
" \"--context-parallel-size\",\n",
705+
" \"1\",\n",
706+
" \"--output-log-prob-seqs\",\n",
707+
"]\n",
708+
"\n",
709+
"# Optional flags\n",
710+
"if model_subset_options:\n",
711+
" predict_ref_command.extend(model_subset_options)\n",
712+
"\n",
713+
"if fp8_option:\n",
714+
" predict_ref_command.append(fp8_option)\n",
715+
"\n",
699716
"print(f\"Running command: {predict_ref_command}\")\n",
700-
"!{predict_ref_command}"
717+
"\n",
718+
"result = run_subprocess_safely(predict_ref_command)\n",
719+
"assert result[\"returncode\"] == 0, result"
701720
]
702721
},
703722
{
@@ -714,8 +733,35 @@
714733
"outputs": [],
715734
"source": [
716735
"%%capture\n",
736+
"predict_var_command = [\n",
737+
" \"predict_evo2\",\n",
738+
" \"--fasta\",\n",
739+
" var_fasta_path,\n",
740+
" \"--ckpt-dir\",\n",
741+
" checkpoint_path,\n",
742+
" \"--output-dir\",\n",
743+
" predict_var_dir,\n",
744+
" \"--model-size\",\n",
745+
" MODEL_SIZE,\n",
746+
" \"--tensor-parallel-size\",\n",
747+
" \"1\",\n",
748+
" \"--pipeline-model-parallel-size\",\n",
749+
" \"1\",\n",
750+
" \"--context-parallel-size\",\n",
751+
" \"1\",\n",
752+
" \"--output-log-prob-seqs\",\n",
753+
"]\n",
754+
"\n",
755+
"if model_subset_options:\n",
756+
"    predict_var_command.extend(model_subset_options)  # each element must be a single argv token (no embedded spaces)\n",
757+
"\n",
758+
"if fp8_option:\n",
759+
" predict_var_command.append(fp8_option)\n",
760+
"\n",
717761
"print(f\"Running command: {predict_var_command}\")\n",
718-
"!{predict_var_command}"
762+
"\n",
763+
"result = run_subprocess_safely(predict_var_command)\n",
764+
"assert result[\"returncode\"] == 0, result"
719765
]
720766
},
721767
{

0 commit comments

Comments
 (0)