add slurm submit and wandb logging to train_voronoi_rf.py

janosh · janosh · commit 6e58a1b36390 · 2023-06-19T20:29:22.000-07:00
diff --git a/.gitignore b/.gitignore
@@ -21,9 +21,7 @@ job-logs/
 *slurm-*.log
 models/**/*.csv
 
-# temporary ignore rule
+# temporary ignore rules
 paper
 meeting-notes
-models/voronoi/*.ipynb
 models/voronoi/*.zip
-pretrained
diff --git a/matbench_discovery/plots.py b/matbench_discovery/plots.py
@@ -488,9 +488,9 @@ def wandb_log_scatter(
     """
     assert set(fields) >= {"x", "y"}, f"{fields=} must specify x and y column names"
 
-    if all("form" in field for field in fields.values()):
-        kwargs.setdefault("x", "DFT formation energy (eV/atom)")
-        kwargs.setdefault("y", "Predicted formation energy (eV/atom)")
+    if "form" in fields["x"] and "form" in fields["y"]:
+        kwargs.setdefault("x_label", "DFT formation energy (eV/atom)")
+        kwargs.setdefault("y_label", "Predicted formation energy (eV/atom)")
 
     scatter_plot = wandb.plot_table(
         vega_spec_name="janosh/scatter-parity",
diff --git a/matbench_discovery/slurm.py b/matbench_discovery/slurm.py
@@ -101,10 +101,11 @@ def slurm_submit(
     if pre_cmd:
         slurm_vars["pre_cmd"] = pre_cmd
 
+    # print sbatch command into slurm log file and at job submission time
+    # but not into terminal or Jupyter
     if (is_slurm_job and is_log_file) or "slurm-submit" in sys.argv:
-        # print sbatch command at submission time and into slurm log file
-        # but not when running in command line or Jupyter
         print(f"\n{' '.join(cmd)}\n".replace(" --", "\n  --"))
+    if is_slurm_job and is_log_file:
         for key, val in slurm_vars.items():
             print(f"{key}={val}")
 
diff --git a/models/voronoi/join_voronoi_features.py b/models/voronoi/join_voronoi_features.py
@@ -13,8 +13,8 @@
 
 # %%
 module_dir = os.path.dirname(__file__)
-date = "2022-11-18"
-glob_pattern = f"{date}-voronoi-features-wbm/voronoi-features-wbm-*.csv.bz2"
+date, data = "2022-11-25", "mp"
+glob_pattern = f"{date}-features-{data}/voronoi-features-{data}-*.csv.bz2"
 file_paths = sorted(glob(f"{module_dir}/{glob_pattern}"))
 print(f"Found {len(file_paths):,} files for {glob_pattern = }")
 
@@ -27,21 +27,17 @@
 for file_path in tqdm(file_paths):
     if file_path in dfs:
         continue
-    try:
-        # keep whole dataframe in memory
-        df = pd.read_csv(file_path).set_index("material_id")
-        dfs[file_path] = df
-    except FileNotFoundError:
-        print(f"{file_path=} not found")
-        continue
+    df = pd.read_csv(file_path).set_index("material_id")
+    dfs[file_path] = df
 
 
 # %%
 df_features = pd.concat(dfs.values())
 
-assert df_features.isna().sum().max() <= 18
+ax = df_features.isna().sum().value_counts().T.plot.bar()
+ax.set(xlabel="# NaNs", ylabel="# columns", title="NaNs per column")
 
 
 # %%
-out_path = f"{module_dir}/{date}-voronoi-features-wbm.csv.bz2"
+out_path = f"{module_dir}/{date}-features-{data}.csv.bz2"
 df_features.to_csv(out_path)
diff --git a/models/voronoi/train_test_voronoi_rf.py b/models/voronoi/train_test_voronoi_rf.py
@@ -0,0 +1,125 @@
+# %%
+import os
+from importlib.metadata import version
+
+import pandas as pd
+import wandb
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.impute import SimpleImputer
+from sklearn.metrics import r2_score
+from sklearn.pipeline import Pipeline
+
+from matbench_discovery import DEBUG, ROOT, today
+from matbench_discovery.plot_scripts import df_wbm
+from matbench_discovery.plots import wandb_log_scatter
+from matbench_discovery.slurm import slurm_submit
+from models.voronoi import featurizer
+
+__author__ = "Janosh Riebesell"
+__date__ = "2022-11-26"
+
+
+# %% output paths and slurm job submission
+module_dir = os.path.dirname(__file__)
+task_type = "IS2RE"
+print(f"{task_type=}")
+
+out_dir = f"{module_dir}/{today}-train-test"
+out_path = f"{out_dir}/e-form-preds-{task_type}.csv"
+if os.path.isfile(out_path):  # never overwrite predictions already written today
+    raise SystemExit(f"{out_path = } already exists, exiting early")
+
+job_name = f"train-test-voronoi-rf{'-debug' if DEBUG else ''}"
+
+slurm_vars = slurm_submit(  # return value is logged to wandb via run_params below
+    job_name=job_name,
+    out_dir=out_dir,
+    partition="icelake-himem",
+    account="LEE-SL3-CPU",
+    time="6:0:0",
+)
+
+
+# %% load train/test feature tables, attach target columns, start the wandb run
+train_path = f"{module_dir}/2022-11-25-features-mp.csv.bz2"
+print(f"{train_path=}")
+df_train = pd.read_csv(train_path).set_index("material_id")
+print(f"{df_train.shape=}")
+
+mp_energies_path = f"{ROOT}/data/mp/2022-08-13-mp-energies.json.gz"
+df_mp = pd.read_json(mp_energies_path).set_index("material_id")
+train_target_col = "formation_energy_per_atom"
+df_train[train_target_col] = df_mp[train_target_col]  # aligned on material_id index
+
+
+test_path = f"{module_dir}/2022-11-18-features-wbm-{task_type}.csv.bz2"
+print(f"{test_path=}")
+df_test = pd.read_csv(test_path).set_index("material_id")
+print(f"{df_test.shape=}")
+
+test_target_col = "e_form_per_atom_mp2020_corrected"
+df_test[test_target_col] = df_wbm[test_target_col]  # aligned on the frames' indices
+model_name = "Voronoi RandomForestRegressor"
+
+run_params = dict(  # passed as config= to wandb.init below
+    train_path=train_path,
+    test_path=test_path,
+    mp_energies_path=mp_energies_path,
+    scikit_learn_version=version("scikit-learn"),
+    matminer_version=version("matminer"),
+    model_name=model_name,
+    train_target_col=train_target_col,
+    test_target_col=test_target_col,
+    df_train=dict(shape=str(df_train.shape)),
+    df_test=dict(shape=str(df_test.shape)),
+    slurm_vars=slurm_vars,
+)
+
+wandb.init(project="matbench-discovery", name=job_name, config=run_params)
+
+
+# %% report and drop training rows that have missing feature values
+feature_names = featurizer.feature_labels()
+n_nans = df_train[feature_names].isna().any(axis=1).sum()  # rows with >=1 NaN feature
+
+print(f"train set NaNs: {n_nans:,} / {len(df_train):,} = {n_nans/len(df_train):.3%}")
+
+df_train = df_train.dropna(subset=feature_names)
+
+
+# %% imputer + random forest pipeline
+model = Pipeline(
+    [
+        ("imputer", SimpleImputer()),  # failed structures; NaN rows also get dropped
+        ("model", RandomForestRegressor(n_estimators=150, n_jobs=-1, verbose=1)),
+    ]
+)
+
+
+# %% fit on the training rows that remain after dropna
+model.fit(df_train[feature_names], df_train[train_target_col])
+
+
+# %% predict on the test set, save predictions, log metrics and parity plot to wandb
+n_nans = df_test[feature_names].isna().any(axis=1).sum()
+print(f"test set NaNs: {n_nans:,} / {len(df_test):,} = {n_nans/len(df_test):.1%}")
+
+df_test = df_test.dropna(subset=feature_names)
+
+pred_col = "e_form_per_atom_voronoi_rf"
+df_test[pred_col] = model.predict(df_test[feature_names])
+df_wbm[pred_col] = df_test[pred_col]  # index-aligned; rows without a prediction are NaN
+
+df_wbm[pred_col].to_csv(out_path)
+
+table = wandb.Table(
+    dataframe=df_wbm[["formula", test_target_col, pred_col]].reset_index()
+)
+
+df_wbm[pred_col].isna().sum()  # NOTE(review): bare expression, result is discarded
+MAE = (df_wbm[test_target_col] - df_wbm[pred_col]).abs().mean()
+R2 = r2_score(*df_wbm[[test_target_col, pred_col]].dropna().to_numpy().T)
+title = f"{model_name} {task_type} {MAE=:.3} {R2=:.3}"
+print(title)
+
+wandb_log_scatter(table, fields=dict(x=test_target_col, y=pred_col), title=title)
diff --git a/models/voronoi/voronoi_featurize_dataset.py b/models/voronoi/voronoi_featurize_dataset.py
@@ -2,6 +2,7 @@
 import os
 import sys
 import warnings
+from importlib.metadata import version
 
 import numpy as np
 import pandas as pd
@@ -13,14 +14,18 @@
 from matbench_discovery.slurm import slurm_submit
 from models.voronoi import featurizer
 
+__author__ = "Janosh Riebesell"
+__date__ = "2022-10-31"
+
+
 data_name = "mp"  # "mp"
 if data_name == "wbm":
     data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-init-structs.json.bz2"
-    input_col = "initial_structure"
 elif data_name == "mp":
     data_path = f"{ROOT}/data/mp/2022-09-16-mp-computed-structure-entries.json.gz"
-    input_col = "relaxed_structure"
 
+input_col = "initial_structure"
+# input_col = "relaxed_structure"
 debug = "slurm-submit" in sys.argv
 job_name = f"voronoi-features-{data_name}{'-debug' if DEBUG else ''}"
 module_dir = os.path.dirname(__file__)
@@ -55,7 +60,9 @@
 
 if data_name == "mp":  # extract structure dicts from ComputedStructureEntry
     struct_dicts = [x["structure"] for x in df_this_job.entry]
-if data_name == "wbm":
+elif data_name == "wbm" and input_col == "relaxed_structure":
+    struct_dicts = [x["structure"] for x in df_this_job.computed_structure_entry]
+elif data_name == "wbm" and input_col == "initial_structure":
     struct_dicts = df_this_job.initial_structure
 
 df_this_job[input_col] = [
@@ -70,6 +77,7 @@
     input_col=input_col,
     slurm_vars=slurm_vars,
     out_path=out_path,
+    matminer_version=version("matminer"),
 )
 
 wandb.init(project="matbench-discovery", name=run_name, config=run_params)