Skip to content

Commit b3c3aba

Browse files
committed
add models/voronoi/join_voronoi_features.py
also add models/voronoi/readme.md to document OOM errors and set_n_jobs(1) solution
1 parent 8508c38 commit b3c3aba

File tree

6 files changed

+76
-8
lines changed

6 files changed

+76
-8
lines changed

.gitignore

+2-2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,6 @@ models/**/*.csv
2424
# temporary ignore rule
2525
paper
2626
meeting-notes
27-
models/voronoi/*
28-
!models/voronoi/*.py
27+
models/voronoi/*.ipynb
28+
models/voronoi/*.zip
2929
pretrained

models/m3gnet/test_m3gnet.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@
6767

6868
# %%
6969
data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-cses+init-structs.json.bz2"
70-
print(f"Loading from {data_path=}")
70+
print(f"{data_path=}")
7171
df_wbm = pd.read_json(data_path).set_index("material_id")
7272

7373
df_this_job: pd.DataFrame = np.array_split(df_wbm, slurm_array_task_count)[

models/megnet/test_megnet.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353

5454
# %%
5555
data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-init-structs.json.bz2"
56-
print(f"Loading from {data_path=}")
56+
print(f"{data_path=}")
5757
df_wbm_structs = pd.read_json(data_path).set_index("material_id")
5858

5959
megnet_mp_e_form = load_model(model_name := "Eform_MP_2019")

models/voronoi/featurize_mp_wbm.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
module_dir = os.path.dirname(__file__)
2020

2121

22-
# data_path = f"{ROOT}/data/mp/2022-09-16-mp-computed-structure-entries.json.gz"
23-
data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-init-structs.json.bz2"
22+
data_path = f"{ROOT}/data/mp/2022-09-16-mp-computed-structure-entries.json.gz"
23+
# data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-init-structs.json.bz2"
2424
input_col = "initial_structure"
2525
data_name = "wbm" if "wbm" in data_path else "mp"
2626
slurm_array_task_count = 10
@@ -31,7 +31,7 @@
3131
job_name=job_name,
3232
partition="icelake-himem",
3333
account="LEE-SL3-CPU",
34-
time=(slurm_max_job_time := "5:0:0"),
34+
time=(slurm_max_job_time := "8:0:0"),
3535
array=f"1-{slurm_array_task_count}",
3636
log_dir=log_dir,
3737
)
@@ -45,12 +45,13 @@
4545
if os.path.isfile(out_path):
4646
raise SystemExit(f"{out_path = } already exists, exciting early")
4747

48+
print(f"{data_path=}")
4849
df = pd.read_json(data_path).set_index("material_id")
4950
df_this_job: pd.DataFrame = np.array_split(df, slurm_array_task_count)[
5051
slurm_array_task_id - 1
5152
]
5253

53-
if data_name == "mp":
54+
if data_name == "mp": # extract structure dicts from ComputedStructureEntry
5455
struct_dicts = [x["structure"] for x in df_this_job.entry]
5556
if data_name == "wbm":
5657
struct_dicts = df_this_job.initial_structure
models/voronoi/join_voronoi_features.py

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
# %%
from __future__ import annotations

import os
from datetime import datetime
from glob import glob

import pandas as pd
from tqdm import tqdm

__author__ = "Janosh Riebesell"
__date__ = "2022-08-16"

today = f"{datetime.now():%Y-%m-%d}"


# %%
# Collect the per-slurm-task Voronoi feature CSVs produced by featurize_mp_wbm.py
# and join them into a single file.
module_dir = os.path.dirname(__file__)
date = "2022-11-18"
glob_pattern = f"{date}-voronoi-features-wbm/voronoi-features-wbm-*.csv.bz2"
file_paths = sorted(glob(f"{module_dir}/{glob_pattern}"))
print(f"Found {len(file_paths):,} files for {glob_pattern = }")

# maps file path -> loaded dataframe so re-running the cell skips finished files
dfs: dict[str, pd.DataFrame] = {}


# %%
# 2022-08-16 tried multiprocessing.Pool() to load files in parallel but was somehow
# slower than serial loading
for file_path in tqdm(file_paths):
    if file_path in dfs:
        continue  # already loaded on a previous run of this cell
    try:
        # keep whole dataframe in memory
        dfs[file_path] = pd.read_csv(file_path).set_index("material_id")
    except FileNotFoundError:
        print(f"{file_path=} not found")


# %%
# stack all per-task frames into one; index stays material_id
df_features = pd.concat(dfs.values())

# sanity check: no feature column should have an excessive number of NaNs
assert df_features.isna().sum().max() <= 18


# %%
out_path = f"{module_dir}/{date}-voronoi-features-wbm.csv.bz2"
df_features.to_csv(out_path)

models/voronoi/readme.md

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Voronoi Tessellation with matminer featurization piped into `scikit-learn` Random Forest
2+
3+
## OOM errors during featurization
4+
5+
`multiprocessing` seems to be the cause of out-of-memory errors on large structures. Initially couldn't get the `matminer` `MultipleFeaturizer` to run without crashing even when running on small subsets of the data (1%) and setting `sbatch` flag `--mem 100G`:
6+
7+
```log
8+
MultipleFeaturizer: 28%|██▊ | 724/2575 [01:08<04:15, 7.25it/s]/var/spool/slurm/slurmd/job7401930/slurm_script: line 4: 2625851 Killed python
9+
slurmstepd: error: Detected 52 oom-kill event(s) in StepId=7401930.batch cgroup. Some of your processes may have been killed by the cgroup out-of-memory handler.
10+
4:00
11+
```
12+
13+
Saving tip came from [Alex Dunn via Slack](https://berkeleytheory.slack.com/archives/D03ULSTNRMX/p1668746161675349) to set `featurizer.set_n_jobs(1)`.
14+
15+
## Archive
16+
17+
Files in `2022-10-04-rhys-voronoi.zip` received from Rhys via [Slack](https://ml-physics.slack.com/archives/DD8GBBRLN/p1664929946687049). All originals before making any changes for this project.

0 commit comments

Comments
 (0)