add slurm_submit_python() in new module mb_discovery/slurm.py

janosh · janosh · commit 97c6949fcf9a · 2023-06-19T20:29:21.000-07:00
diff --git a/.gitignore b/.gitignore
@@ -21,6 +21,9 @@ job-logs/
 # slurm logs
 slurm-*out
 models/**/*.csv
+mb_discovery/energy/**/*.csv
 
 # temporary ignore rule
 paper
+meeting-notes
+models/voronoi
diff --git a/mb_discovery/slurm.py b/mb_discovery/slurm.py
@@ -0,0 +1,70 @@
+import os
+import subprocess
+import sys
+from collections.abc import Sequence
+
+
+def _get_calling_file_path(frame: int = 1) -> str:
+    """Look up the source file of a function further up the call stack.
+
+    Args:
+        frame (int, optional): Number of stack frames to climb, counted from this
+            function's own frame (0), so 1 is the direct caller. Defaults to 1.
+
+    Returns:
+        str: co_filename of the code object running in the selected frame.
+    """
+    return sys._getframe(frame).f_code.co_filename
+
+
+def slurm_submit_python(
+    job_name: str,
+    log_dir: str,
+    time: str,
+    py_file_path: str = None,
+    slurm_flags: Sequence[str] = (),
+    partition: str = "icelake",
+    account: str = "LEE-SL3-CPU",
+    array: str = "",
+    env_vars: str = "",
+) -> None:
+    """Slurm submit a python script using sbatch --wrap 'python path/to/file.py' by
+    calling this function in the script and invoking the script with
+    `python path/to/file.py slurm-submit`. Does nothing (returns None) unless
+    'slurm-submit' is in sys.argv.
+
+    Args:
+        job_name (str): Slurm job name.
+        log_dir (str): Directory to write slurm logs (created if missing). Log file
+            names include job ID and array task ID.
+        time (str): 'HH:MM:SS' time limit for the job.
+        py_file_path (str): Path to the python script to be submitted. Defaults to the
+            path of the file calling slurm_submit_python().
+        slurm_flags (Sequence[str], optional): Extra slurm CLI flags. Defaults to ().
+        partition (str, optional): Slurm partition. Defaults to "icelake".
+        account (str, optional): Account to charge for this job.
+            Defaults to "LEE-SL3-CPU".
+        array (str, optional): Slurm array specifier. --array is omitted if empty.
+            Defaults to "".
+        env_vars (str, optional): Environment variables to set when running the python
+            script, e.g. ENV_VAR=42 python path/to/file.py. Defaults to "".
+
+    Raises:
+        SystemExit: After sbatch succeeded, with its return code (0).
+        subprocess.CalledProcessError: If sbatch exits with a non-zero status.
+    """
+    if "slurm-submit" not in sys.argv:
+        return
+    os.makedirs(log_dir, exist_ok=True)  # slurm fails if log_dir is missing
+    if py_file_path is None:  # frame=2 is whoever called slurm_submit_python()
+        py_file_path = _get_calling_file_path(frame=2)
+    # Pass every value as its own argv item: without a shell nothing strips quotes,
+    # so repr quotes from f"--{time=}" or a quoted --wrap would reach sbatch as-is.
+    cmd = ["sbatch", "--partition", partition, "--account", account, "--time", time]
+    if array:  # empty default means no job array, so don't send a blank --array
+        cmd += ["--array", array]
+    cmd += ["--job-name", job_name, "--output", f"{log_dir}/slurm-%A-%a.out"]
+    cmd += [*slurm_flags, "--wrap", f"{env_vars} python {py_file_path}".strip()]
+    result = subprocess.run(cmd, check=True)
+
+    raise SystemExit(result.returncode)
diff --git a/models/bowsr/join_bowsr_results.py b/models/bowsr/join_bowsr_results.py
@@ -30,8 +30,6 @@
 
 
 # %%
-# 2022-08-16 tried multiprocessing.Pool() to load files in parallel but was somehow
-# slower than serial loading
 for file_path in tqdm(file_paths):
     if file_path in dfs:
         continue
diff --git a/models/bowsr/slurm_array_bowsr_wbm.py b/models/bowsr/slurm_array_bowsr_wbm.py
@@ -15,57 +15,61 @@
 from tqdm import tqdm
 
 from mb_discovery import ROOT, as_dict_handler
+from mb_discovery.slurm import slurm_submit_python
 
-"""
-To slurm submit this file, use
-
-```sh
-log_dir=models/bowsr/$(date +"%Y-%m-%d")-bowsr-megnet-wbm
-job_name=bowsr-megnet-wbm-IS2RE
-mkdir -p $log_dir # slurm fails if log_dir is missing
-
-sbatch --partition icelake-himem --account LEE-SL3-CPU --array 1-500 \
-    --time 12:0:0 --job-name $job_name --mem 12000 \
-    --output $log_dir/slurm-%A-%a.out \
-    --wrap "TF_CPP_MIN_LOG_LEVEL=2 python models/bowsr/slurm_array_bowsr_wbm.py"
-```
+__author__ = "Janosh Riebesell"
+__date__ = "2022-08-15"
 
---time 2h is probably enough but missing indices are annoying so best be safe.
---mem 12000 avoids slurmstepd: error: Detected 1 oom-kill event(s)
-    Some of your processes may have been killed by the cgroup out-of-memory handler.
+"""
+To slurm submit this file, run:
 
-TF_CPP_MIN_LOG_LEVEL=2 means INFO and WARNING logs are not printed
-https://stackoverflow.com/a/40982782
+python path/to/file.py slurm-submit
 
 Requires MEGNet and MAML installation: pip install megnet maml
 """
 
-__author__ = "Janosh Riebesell"
-__date__ = "2022-08-15"
-
+task_type = "IS2RE"  # "RS2RE"
+today = f"{datetime.now():%Y-%m-%d}"
+module_dir = os.path.dirname(__file__)
+# --mem 12000 avoids slurmstepd: error: Detected 1 oom-kill event(s)
+#     Some of your processes may have been killed by the cgroup out-of-memory handler.
+slurm_mem_per_node = 12000
+# set large job array size for fast testing/debugging
+slurm_array_task_count = 500
+out_dir = f"{module_dir}/{today}-bowsr-megnet-wbm-{task_type}"
 
-task_type = "IS2RE"
-# task_type = "RS2RE"
 data_path = f"{ROOT}/data/2022-06-26-wbm-cses-and-initial-structures.json.gz"
 
-module_dir = os.path.dirname(__file__)
+slurm_submit_python(
+    job_name=f"bowsr-megnet-wbm-{task_type}",
+    log_dir=out_dir,
+    time=(slurm_max_job_time := "3:0:0"),
+    # --time 2h is probably enough but best be safe.
+    array=f"1-{slurm_array_task_count}",
+    slurm_flags=("--mem", str(slurm_mem_per_node)),
+    partition="icelake-himem",
+    # TF_CPP_MIN_LOG_LEVEL=2 means INFO and WARNING logs are not printed
+    # https://stackoverflow.com/a/40982782
+    env_vars="TF_CPP_MIN_LOG_LEVEL=2",
+)
+
+
+# %%
 slurm_job_id = os.environ.get("SLURM_JOB_ID", "debug")
 slurm_array_task_id = int(os.environ.get("SLURM_ARRAY_TASK_ID", 0))
-# set large fallback job array size for fast testing/debugging
-slurm_array_task_count = int(os.environ.get("SLURM_ARRAY_TASK_COUNT", 10_000))
+out_path = f"{out_dir}/{slurm_array_task_id}.json.gz"
 
 print(f"Job started running {datetime.now():%Y-%m-%d@%H-%M}")
 print(f"{slurm_job_id = }")
 print(f"{slurm_array_task_id = }")
+print(f"{data_path = }")
+print(f"{out_path = }")
 print(f"{version('maml') = }")
 print(f"{version('megnet') = }")
 
-today = f"{datetime.now():%Y-%m-%d}"
-out_dir = f"{module_dir}/{today}-bowsr-megnet-wbm-{task_type}"
-json_out_path = f"{out_dir}/{slurm_array_task_id}.json.gz"
 
-if os.path.isfile(json_out_path):
-    raise SystemExit(f"{json_out_path = } already exists, exciting early")
+if os.path.isfile(out_path):
+    raise SystemExit(f"{out_path = } already exists, exciting early")
 
 
 # %%
@@ -78,14 +82,16 @@
 optimize_kwargs = dict(n_init=100, n_iter=100, alpha=0.026**2)
 
 run_params = dict(
-    megnet_version=version("megnet"),
-    maml_version=version("maml"),
-    slurm_job_id=slurm_job_id,
-    slurm_array_task_id=slurm_array_task_id,
-    slurm_array_task_count=slurm_array_task_count,
-    data_path=data_path,
     bayes_optim_kwargs=bayes_optim_kwargs,
+    data_path=data_path,
+    maml_version=version("maml"),
+    megnet_version=version("megnet"),
     optimize_kwargs=optimize_kwargs,
+    slurm_array_task_count=slurm_array_task_count,
+    slurm_array_task_id=slurm_array_task_id,
+    slurm_job_id=slurm_job_id,
+    slurm_max_job_time=slurm_max_job_time,
+    slurm_mem_per_node=slurm_mem_per_node,
     task_type=task_type,
 )
 if wandb.run is None:
@@ -127,7 +133,7 @@
 
 
 for material_id, structure in tqdm(
-    structures.items(), desc="Main loop", total=len(structures)
+    structures.items(), desc="Main loop", total=len(structures), disable=None
 ):
     if material_id in relax_results:
         continue
@@ -154,6 +160,6 @@
 df_output = pd.DataFrame(relax_results).T
 df_output.index.name = "material_id"
 
-df_output.reset_index().to_json(json_out_path, default_handler=as_dict_handler)
+df_output.reset_index().to_json(out_path, default_handler=as_dict_handler)
 
-wandb.log_artifact(json_out_path, type=f"bowsr-megnet-wbm-{task_type}")
+wandb.log_artifact(out_path, type=f"bowsr-megnet-wbm-{task_type}")
diff --git a/models/m3gnet/slurm_array_m3gnet_relax_wbm.py b/models/m3gnet/slurm_array_m3gnet_relax_wbm.py
@@ -11,37 +11,43 @@
 import pandas as pd
 import wandb
 from m3gnet.models import Relaxer
+from tqdm import tqdm
 
 from mb_discovery import ROOT, as_dict_handler
+from mb_discovery.slurm import slurm_submit_python
 
 """
-To slurm submit this file, use
+To slurm submit this file, run:
 
-```sh
-job_name=m3gnet-wbm-relax-IS2RE
-log_dir=models/m3gnet/$(date +"%Y-%m-%d")-$job_name
-mkdir -p $log_dir # slurm fails if log_dir is missing
-
-sbatch --partition icelake-himem --account LEE-SL3-CPU --array 1-100 \
-    --time 3:0:0 --job-name $job_name --mem 12000 \
-    --output $log_dir/slurm-%A-%a.out \
-    --wrap "TF_CPP_MIN_LOG_LEVEL=2 python models/m3gnet/slurm_array_m3gnet_relax_wbm.py"
-```
-
---time 2h is probably enough but missing indices are annoying so best be safe.
-
-TF_CPP_MIN_LOG_LEVEL=2 means INFO and WARNING logs are not printed
-https://stackoverflow.com/a/40982782
+python path/to/file.py slurm-submit
 
 Requires M3GNet installation: pip install m3gnet
 """
 
 __author__ = "Janosh Riebesell"
 __date__ = "2022-08-15"
 
-task_type = "IS2RE"
-# task_type = "RS2RE"
+task_type = "IS2RE"  # "RS2RE"
+today = f"{datetime.now():%Y-%m-%d}"
+module_dir = os.path.dirname(__file__)
+slurm_array_task_count = 100
+slurm_mem_per_node = 12000
+job_name = f"m3gnet-wbm-relax-{task_type}"
+out_dir = f"{module_dir}/{today}-{job_name}"
+
+slurm_submit_python(
+    job_name=job_name,
+    log_dir=out_dir,
+    time=(slurm_max_job_time := "3:0:0"),
+    array=f"1-{slurm_array_task_count}",
+    slurm_flags=("--mem", str(slurm_mem_per_node)),
+    # TF_CPP_MIN_LOG_LEVEL=2 means INFO and WARNING logs are not printed
+    # https://stackoverflow.com/a/40982782
+    env_vars="TF_CPP_MIN_LOG_LEVEL=2",
+)
+
 
+# %%
 slurm_job_id = os.environ.get("SLURM_JOB_ID", "debug")
 slurm_array_task_id = int(os.environ.get("SLURM_ARRAY_TASK_ID", 0))
 # set large fallback job array size for fast testing/debugging
@@ -52,8 +58,6 @@
 print(f"{slurm_array_task_id = }")
 print(f"{version('m3gnet') = }")
 
-today = f"{datetime.now():%Y-%m-%d}"
-out_dir = f"{ROOT}/data/{today}-m3gnet-wbm-{task_type}"
 json_out_path = f"{out_dir}/{slurm_array_task_id}.json.gz"
 
 if os.path.isfile(json_out_path):
@@ -71,19 +75,21 @@
 df_this_job = np.array_split(df_wbm, slurm_array_task_count)[slurm_array_task_id - 1]
 
 run_params = dict(
+    data_path=data_path,
     m3gnet_version=version("m3gnet"),
-    slurm_job_id=slurm_job_id,
-    slurm_array_task_id=slurm_array_task_id,
     slurm_array_task_count=slurm_array_task_count,
-    data_path=data_path,
+    slurm_array_task_id=slurm_array_task_id,
+    slurm_job_id=slurm_job_id,
+    slurm_max_job_time=slurm_max_job_time,
+    slurm_mem_per_node=slurm_mem_per_node,
     task_type=task_type,
 )
 if wandb.run is None:
     wandb.login()
 
 wandb.init(
     project="m3gnet",
-    name=f"m3gnet-wbm-relax-{task_type}-{slurm_job_id}-{slurm_array_task_id}",
+    name=f"{job_name}-{slurm_job_id}-{slurm_array_task_id}",
     config=run_params,
 )
 
@@ -105,7 +111,7 @@
     raise ValueError(f"Unknown {task_type = }")
 
 
-for material_id, structure in structures.items():
+for material_id, structure in tqdm(structures.items(), disable=None):
     if material_id in relax_results:
         continue
     relax_result = relaxer.relax(structure)
diff --git a/tests/test_slurm.py b/tests/test_slurm.py
@@ -0,0 +1,31 @@
+import sys
+from unittest.mock import patch
+
+import pytest
+
+from mb_discovery.slurm import _get_calling_file_path, slurm_submit_python
+
+
+def test_slurm_submit(tmp_path) -> None:
+    with patch("sys.argv", [*sys.argv, "slurm-submit"]), patch(
+        "mb_discovery.slurm.subprocess.run"
+    ) as mock_run, pytest.raises(SystemExit) as exc_info:
+        mock_run.return_value.returncode = 0  # else SystemExit.code is a MagicMock
+        slurm_submit_python(
+            job_name="test_job",
+            log_dir=str(tmp_path),  # keep the log dir out of the working directory
+            time="0:0:1",
+            slurm_flags=("--test_flag",),
+        )
+    # asserts placed after the raising call inside pytest.raises() never execute
+    assert exc_info.value.code == 0
+    assert mock_run.call_count == 1
+
+
+def test_get_calling_file_path() -> None:
+    assert _get_calling_file_path(frame=1) == __file__  # direct caller is this file
+
+    def wrapper(frame: int) -> str:  # adds one extra frame to the call stack
+        return _get_calling_file_path(frame)
+
+    assert wrapper(frame=2) == __file__  # frame 2 = this test, past wrapper()