Skip to content

Commit 6e58a1b

Browse files
committed
add slurm submit and wandb logging to train_voronoi_rf.py
1 parent 0236c36 commit 6e58a1b

File tree

6 files changed

+150
-22
lines changed

6 files changed

+150
-22
lines changed

.gitignore

+1-3
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,7 @@ job-logs/
2121
*slurm-*.log
2222
models/**/*.csv
2323

24-
# temporary ignore rule
24+
# temporary ignore rules
2525
paper
2626
meeting-notes
27-
models/voronoi/*.ipynb
2827
models/voronoi/*.zip
29-
pretrained

matbench_discovery/plots.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -488,9 +488,9 @@ def wandb_log_scatter(
488488
"""
489489
assert set(fields) >= {"x", "y"}, f"{fields=} must specify x and y column names"
490490

491-
if all("form" in field for field in fields.values()):
492-
kwargs.setdefault("x", "DFT formation energy (eV/atom)")
493-
kwargs.setdefault("y", "Predicted formation energy (eV/atom)")
491+
if "form" in fields["x"] and "form" in fields["y"]:
492+
kwargs.setdefault("x_label", "DFT formation energy (eV/atom)")
493+
kwargs.setdefault("y_label", "Predicted formation energy (eV/atom)")
494494

495495
scatter_plot = wandb.plot_table(
496496
vega_spec_name="janosh/scatter-parity",

matbench_discovery/slurm.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,11 @@ def slurm_submit(
101101
if pre_cmd:
102102
slurm_vars["pre_cmd"] = pre_cmd
103103

104+
# print sbatch command into slurm log file and at job submission time
105+
# but not into terminal or Jupyter
104106
if (is_slurm_job and is_log_file) or "slurm-submit" in sys.argv:
105-
# print sbatch command at submission time and into slurm log file
106-
# but not when running in command line or Jupyter
107107
print(f"\n{' '.join(cmd)}\n".replace(" --", "\n --"))
108+
if is_slurm_job and is_log_file:
108109
for key, val in slurm_vars.items():
109110
print(f"{key}={val}")
110111

models/voronoi/join_voronoi_features.py

+7-11
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313

1414
# %%
1515
module_dir = os.path.dirname(__file__)
16-
date = "2022-11-18"
17-
glob_pattern = f"{date}-voronoi-features-wbm/voronoi-features-wbm-*.csv.bz2"
16+
date, data = "2022-11-25", "mp"
17+
glob_pattern = f"{date}-features-{data}/voronoi-features-{data}-*.csv.bz2"
1818
file_paths = sorted(glob(f"{module_dir}/{glob_pattern}"))
1919
print(f"Found {len(file_paths):,} files for {glob_pattern = }")
2020

@@ -27,21 +27,17 @@
2727
for file_path in tqdm(file_paths):
2828
if file_path in dfs:
2929
continue
30-
try:
31-
# keep whole dataframe in memory
32-
df = pd.read_csv(file_path).set_index("material_id")
33-
dfs[file_path] = df
34-
except FileNotFoundError:
35-
print(f"{file_path=} not found")
36-
continue
30+
df = pd.read_csv(file_path).set_index("material_id")
31+
dfs[file_path] = df
3732

3833

3934
# %%
4035
df_features = pd.concat(dfs.values())
4136

42-
assert df_features.isna().sum().max() <= 18
37+
ax = df_features.isna().sum().value_counts().T.plot.bar()
38+
ax.set(xlabel="# NaNs", ylabel="# columns", title="NaNs per column")
4339

4440

4541
# %%
46-
out_path = f"{module_dir}/{date}-voronoi-features-wbm.csv.bz2"
42+
out_path = f"{module_dir}/{date}-features-{data}.csv.bz2"
4743
df_features.to_csv(out_path)
+125
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
# %%
2+
import os
3+
from importlib.metadata import version
4+
5+
import pandas as pd
6+
import wandb
7+
from sklearn.ensemble import RandomForestRegressor
8+
from sklearn.impute import SimpleImputer
9+
from sklearn.metrics import r2_score
10+
from sklearn.pipeline import Pipeline
11+
12+
from matbench_discovery import DEBUG, ROOT, today
13+
from matbench_discovery.plot_scripts import df_wbm
14+
from matbench_discovery.plots import wandb_log_scatter
15+
from matbench_discovery.slurm import slurm_submit
16+
from models.voronoi import featurizer
17+
18+
__author__ = "Janosh Riebesell"
19+
__date__ = "2022-11-26"
20+
21+
22+
# %%
23+
module_dir = os.path.dirname(__file__)
24+
task_type = "IS2RE"
25+
print(f"{task_type=}")
26+
27+
out_dir = f"{module_dir}/{today}-train-test"
28+
out_path = f"{out_dir}/e-form-preds-{task_type}.csv"
29+
if os.path.isfile(out_path):
30+
# abort instead of overwriting a predictions file that already exists at out_path
raise SystemExit(f"{out_path = } already exists, exiting early")
31+
32+
job_name = f"train-test-voronoi-rf{'-debug' if DEBUG else ''}"
33+
34+
slurm_vars = slurm_submit(
35+
job_name=job_name,
36+
out_dir=out_dir,
37+
partition="icelake-himem",
38+
account="LEE-SL3-CPU",
39+
time="6:0:0",
40+
)
41+
42+
43+
# %%
44+
train_path = f"{module_dir}/2022-11-25-features-mp.csv.bz2"
45+
print(f"{train_path=}")
46+
df_train = pd.read_csv(train_path).set_index("material_id")
47+
print(f"{df_train.shape=}")
48+
49+
mp_energies_path = f"{ROOT}/data/mp/2022-08-13-mp-energies.json.gz"
50+
df_mp = pd.read_json(mp_energies_path).set_index("material_id")
51+
train_target_col = "formation_energy_per_atom"
52+
df_train[train_target_col] = df_mp[train_target_col]
53+
54+
55+
test_path = f"{module_dir}/2022-11-18-features-wbm-{task_type}.csv.bz2"
56+
print(f"{test_path=}")
57+
df_test = pd.read_csv(test_path).set_index("material_id")
58+
print(f"{df_test.shape=}")
59+
60+
test_target_col = "e_form_per_atom_mp2020_corrected"
61+
df_test[test_target_col] = df_wbm[test_target_col]
62+
model_name = "Voronoi RandomForestRegressor"
63+
64+
run_params = dict(
65+
train_path=train_path,
66+
test_path=test_path,
67+
mp_energies_path=mp_energies_path,
68+
scikit_learn_version=version("scikit-learn"),
69+
matminer_version=version("matminer"),
70+
model_name=model_name,
71+
train_target_col=train_target_col,
72+
test_target_col=test_target_col,
73+
df_train=dict(shape=str(df_train.shape)),
74+
df_test=dict(shape=str(df_test.shape)),
75+
slurm_vars=slurm_vars,
76+
)
77+
78+
wandb.init(project="matbench-discovery", name=job_name, config=run_params)
79+
80+
81+
# %%
82+
feature_names = featurizer.feature_labels()
83+
n_nans = df_train[feature_names].isna().any(axis=1).sum()
84+
85+
print(f"train set NaNs: {n_nans:,} / {len(df_train):,} = {n_nans/len(df_train):.3%}")
86+
87+
df_train = df_train.dropna(subset=feature_names)
88+
89+
90+
# %%
91+
model = Pipeline(
92+
[
93+
("imputer", SimpleImputer()), # For the failed structures
94+
("model", RandomForestRegressor(n_estimators=150, n_jobs=-1, verbose=1)),
95+
]
96+
)
97+
98+
99+
# %%
100+
model.fit(df_train[feature_names], df_train[train_target_col])
101+
102+
103+
# %%
104+
n_nans = df_test[feature_names].isna().any(axis=1).sum()
105+
# n_nans was counted on df_test, so report it against the test set size
# (the original divided by len(df_train), giving a wrong total and fraction)
print(f"test set NaNs: {n_nans:,} / {len(df_test):,} = {n_nans/len(df_test):.1%}")
106+
107+
df_test = df_test.dropna(subset=feature_names)
108+
109+
pred_col = "e_form_per_atom_voronoi_rf"
110+
df_test[pred_col] = model.predict(df_test[feature_names])
111+
df_wbm[pred_col] = df_test[pred_col]
112+
113+
df_wbm[pred_col].to_csv(out_path)
114+
115+
table = wandb.Table(
116+
dataframe=df_wbm[["formula", test_target_col, pred_col]].reset_index()
117+
)
118+
119+
df_wbm[pred_col].isna().sum()
120+
MAE = (df_wbm[test_target_col] - df_wbm[pred_col]).abs().mean()
121+
R2 = r2_score(*df_wbm[[test_target_col, pred_col]].dropna().to_numpy().T)
122+
title = f"{model_name} {task_type} {MAE=:.3} {R2=:.3}"
123+
print(title)
124+
125+
wandb_log_scatter(table, fields=dict(x=test_target_col, y=pred_col), title=title)

models/voronoi/voronoi_featurize_dataset.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import sys
44
import warnings
5+
from importlib.metadata import version
56

67
import numpy as np
78
import pandas as pd
@@ -13,14 +14,18 @@
1314
from matbench_discovery.slurm import slurm_submit
1415
from models.voronoi import featurizer
1516

17+
__author__ = "Janosh Riebesell"
18+
__date__ = "2022-10-31"
19+
20+
1621
data_name = "mp" # "mp"
1722
if data_name == "wbm":
1823
data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-init-structs.json.bz2"
19-
input_col = "initial_structure"
2024
elif data_name == "mp":
2125
data_path = f"{ROOT}/data/mp/2022-09-16-mp-computed-structure-entries.json.gz"
22-
input_col = "relaxed_structure"
2326

27+
input_col = "initial_structure"
28+
# input_col = "relaxed_structure"
2429
debug = "slurm-submit" in sys.argv
2530
job_name = f"voronoi-features-{data_name}{'-debug' if DEBUG else ''}"
2631
module_dir = os.path.dirname(__file__)
@@ -55,7 +60,9 @@
5560

5661
if data_name == "mp": # extract structure dicts from ComputedStructureEntry
5762
struct_dicts = [x["structure"] for x in df_this_job.entry]
58-
if data_name == "wbm":
63+
elif data_name == "wbm" and input_col == "relaxed_structure":
64+
struct_dicts = [x["structure"] for x in df_this_job.computed_structure_entry]
65+
elif data_name == "wbm" and input_col == "initial_structure":
5966
struct_dicts = df_this_job.initial_structure
6067

6168
df_this_job[input_col] = [
@@ -70,6 +77,7 @@
7077
input_col=input_col,
7178
slurm_vars=slurm_vars,
7279
out_path=out_path,
80+
matminer_version=version("matminer"),
7381
)
7482

7583
wandb.init(project="matbench-discovery", name=run_name, config=run_params)

0 commit comments

Comments
 (0)