Skip to content

Commit 5d9a3b2

Browse files
committed
add slurm_submit_python() to use_cgcnn_ensemble.py and use_wrenformer_ensemble.py
auto-load default ampere environment (rhel8/default-amp module) in slurm_submit_python() if partition contains 'GPU'
1 parent e5b099c commit 5d9a3b2

File tree

5 files changed

+44
-18
lines changed

5 files changed

+44
-18
lines changed

matbench_discovery/slurm.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,20 @@ def slurm_submit_python(
5050
pre_cmd (str, optional): Things like `module load` commands and environment
5151
variables to set when running the python script go here. Example:
5252
pre_cmd='ENV_VAR=42' or 'module load rhel8/default-amp;'. Defaults to "".
53+
If running on CPU, pre_cmd="unset OMP_NUM_THREADS" allows PyTorch to use
54+
all cores https://docs.hpc.cam.ac.uk/hpc/software-packages/pytorch.html
5355
5456
Raises:
5557
SystemExit: Exit code will be subprocess.run(['sbatch', ...]).returncode.
5658
"""
5759
if py_file_path is None:
5860
py_file_path = _get_calling_file_path(frame=2)
5961

62+
if "GPU" in partition:
63+
# on Ampere GPU partition, source module CLI and load default Ampere env
64+
# before actual job command
65+
pre_cmd += ". /etc/profile.d/modules.sh; module load rhel8/default-amp;"
66+
6067
cmd = [
6168
*f"sbatch --{partition=} --{account=} --{time=}".replace("'", "").split(),
6269
*("--job-name", job_name),
@@ -72,7 +79,7 @@ def slurm_submit_python(
7279
if (is_slurm_job and is_log_file) or "slurm-submit" in sys.argv:
7380
# print sbatch command at submission time and into slurm log file
7481
# but not when running in command line or Jupyter
75-
print(" ".join(cmd))
82+
print(f"\n{' '.join(cmd)}\n")
7683

7784
if "slurm-submit" not in sys.argv:
7885
return

models/cgcnn/slurm_train_cgcnn_ensemble.py

-3
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,6 @@
4040
array=f"1-{n_folds}",
4141
log_dir=log_dir,
4242
slurm_flags=("--nodes", "1", "--gpus-per-node", "1"),
43-
# prepend into sbatch script to source module command and load default env
44-
# for Ampere GPU partition before actual job command
45-
pre_cmd=". /etc/profile.d/modules.sh; module load rhel8/default-amp;",
4643
)
4744

4845

models/cgcnn/use_cgcnn_ensemble.py

+21-5
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
from matbench_discovery import ROOT
1717
from matbench_discovery.plot_scripts import df_wbm
18+
from matbench_discovery.slurm import slurm_submit_python
1819

1920
__author__ = "Janosh Riebesell"
2021
__date__ = "2022-08-15"
@@ -27,27 +28,41 @@
2728

2829
module_dir = os.path.dirname(__file__)
2930
today = f"{datetime.now():%Y-%m-%d}"
31+
ensemble_id = "cgcnn-e_form-ensemble-1"
32+
run_name = f"{today}-{ensemble_id}-IS2RE"
33+
34+
slurm_submit_python(
35+
job_name=run_name,
36+
partition="ampere",
37+
account="LEE-SL3-GPU",
38+
time="1:0:0",
39+
log_dir=module_dir,
40+
slurm_flags=("--nodes", "1", "--gpus-per-node", "1"),
41+
)
3042

3143

3244
# %%
3345
data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-init-structs.json.bz2"
3446
df = pd.read_json(data_path).set_index("material_id", drop=False)
3547
old_len = len(df)
48+
no_init_structs = df.query("initial_structure.isnull()").index
3649
df = df.dropna() # two missing initial structures
3750
assert len(df) == old_len - 2
3851

52+
assert all(
53+
df.index == df_wbm.drop(index=no_init_structs).index
54+
), "df and df_wbm must have same index"
3955
df["e_form_per_atom_mp2020_corrected"] = df_wbm.e_form_per_atom_mp2020_corrected
4056

4157
target_col = "e_form_per_atom_mp2020_corrected"
4258
input_col = "initial_structure"
4359
assert target_col in df, f"{target_col=} not in {list(df)}"
4460
assert input_col in df, f"{input_col=} not in {list(df)}"
4561

46-
df[input_col] = [Structure.from_dict(x) for x in tqdm(df[input_col])]
62+
df[input_col] = [Structure.from_dict(x) for x in tqdm(df[input_col], disable=None)]
4763

4864
wandb.login()
4965
wandb_api = wandb.Api()
50-
ensemble_id = "cgcnn-e_form-ensemble-1"
5166
runs = wandb_api.runs(
5267
"janosh/matbench-discovery", filters={"tags": {"$in": [ensemble_id]}}
5368
)
@@ -62,10 +77,11 @@
6277
)
6378
df, ensemble_metrics = predict_from_wandb_checkpoints(
6479
runs,
65-
df=cg_data.df, # dropping isolated-atom structs means len(cg_data.df) < len(df)
80+
# dropping isolated-atom structs means len(cg_data.df) < len(df)
81+
df=cg_data.df.reset_index(drop=True).drop(columns=input_col),
6682
target_col=target_col,
67-
model_class=CrystalGraphConvNet,
83+
model_cls=CrystalGraphConvNet,
6884
data_loader=data_loader,
6985
)
7086

71-
df.round(6).to_csv(f"{module_dir}/{today}-{ensemble_id}-preds-{target_col}.csv")
87+
df.round(6).to_csv(f"{module_dir}/{today}-{run_name}-preds.csv")

models/wrenformer/mp/use_wrenformer_ensemble.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
from aviary.wrenformer.data import df_to_in_mem_dataloader
1111
from aviary.wrenformer.model import Wrenformer
1212

13+
from matbench_discovery.slurm import slurm_submit_python
14+
1315
__author__ = "Janosh Riebesell"
1416
__date__ = "2022-08-15"
1517

@@ -21,6 +23,17 @@
2123

2224
module_dir = os.path.dirname(__file__)
2325
today = f"{datetime.now():%Y-%m-%d}"
26+
ensemble_id = "wrenformer-e_form-ensemble-1"
27+
run_name = f"{today}-{ensemble_id}-IS2RE"
28+
29+
slurm_submit_python(
30+
job_name=run_name,
31+
partition="ampere",
32+
account="LEE-SL3-GPU",
33+
time="1:0:0",
34+
log_dir=module_dir,
35+
slurm_flags=("--nodes", "1", "--gpus-per-node", "1"),
36+
)
2437

2538

2639
# %%
@@ -35,7 +48,6 @@
3548

3649
wandb.login()
3750
wandb_api = wandb.Api()
38-
ensemble_id = "wrenformer-e_form-ensemble-1"
3951
runs = wandb_api.runs(
4052
"janosh/matbench-discovery", filters={"tags": {"$in": [ensemble_id]}}
4153
)
@@ -52,7 +64,7 @@
5264
)
5365

5466
df, ensemble_metrics = predict_from_wandb_checkpoints(
55-
runs, data_loader, df=df, model_class=Wrenformer
67+
runs, data_loader=data_loader, df=df, model_cls=Wrenformer
5668
)
5769

58-
df.round(6).to_csv(f"{module_dir}/{today}-{ensemble_id}-preds-{target_col}.csv")
70+
df.round(6).to_csv(f"{module_dir}/{today}-{run_name}-preds.csv")

models/wrenformer/slurm_train_wrenformer_ensemble.py

-6
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,6 @@
3333
array=f"1-{n_folds}",
3434
log_dir=log_dir,
3535
slurm_flags=("--nodes", "1", "--gpus-per-node", "1"),
36-
# prepend into sbatch script to source module command and load default env
37-
# for Ampere GPU partition before actual job command
38-
pre_cmd=". /etc/profile.d/modules.sh; module load rhel8/default-amp;",
39-
# if running on CPU, unsetting OMP threads allows PyTorch to use all cores
40-
# https://docs.hpc.cam.ac.uk/hpc/software-packages/pytorch.html
41-
# pre_cmd="unset OMP_NUM_THREADS",
4236
)
4337

4438

0 commit comments

Comments
 (0)