delete outdated 'from matbench_discovery import DEBUG'

janosh · janosh · commit a9386fb3549c · 2023-08-10T18:48:33.000-07:00
import col names from matbench_discovery.preds
add license
rm .gitmodules
diff --git a/.gitmodules b/.gitmodules
diff --git a/data/wbm/eda.py b/data/wbm/eda.py
@@ -94,8 +94,7 @@
 
 
 # %% histogram of energy above MP convex hull for WBM
-col = "e_above_hull_mp2020_corrected_ppd_mp"
-# col = "e_form_per_atom_mp2020_corrected"
+col = each_true_col  # or e_form_col
 mean, std = df_wbm[col].mean(), df_wbm[col].std()
 
 range_x = (mean - 2 * std, mean + 2 * std)
diff --git a/license b/license
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Janosh Riebesell
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+The software is provided "as is", without warranty of any kind, express or
+implied, including but not limited to the warranties of merchantability,
+fitness for a particular purpose and noninfringement. In no event shall the
+authors or copyright holders be liable for any claim, damages or other
+liability, whether in an action of contract, tort or otherwise, arising from,
+out of or in connection with the software or the use or other dealings in the
+software.
diff --git a/models/alignn/test_alignn.py b/models/alignn/test_alignn.py
@@ -20,6 +20,7 @@
 from matbench_discovery import today
 from matbench_discovery.data import DATA_FILES, df_wbm
 from matbench_discovery.plots import wandb_scatter
+from matbench_discovery.preds import e_form_col
 from matbench_discovery.slurm import slurm_submit
 
 __author__ = "Janosh Riebesell, Philipp Benner"
@@ -33,7 +34,7 @@
 # TODO fix this to load checkpoint from figshare
 # model_name = f"{module_dir}/data-train-result/best-model.pth"
 task_type = "IS2RE"
-target_col = "e_form_per_atom_mp2020_corrected"
+target_col = e_form_col
 input_col = "initial_structure"
 id_col = "material_id"
 device = "cuda" if torch.cuda.is_available() else "cpu"
diff --git a/models/alignn_ff/alignn_ff_relax.py b/models/alignn_ff/alignn_ff_relax.py
@@ -9,8 +9,9 @@
 from pymatgen.io.jarvis import JarvisAtomsAdaptor
 from tqdm import tqdm
 
-from matbench_discovery import DEBUG, today
+from matbench_discovery import today
 from matbench_discovery.data import DATA_FILES, df_wbm
+from matbench_discovery.preds import e_form_col as target_col
 
 __author__ = "Janosh Riebesell, Philipp Benner"
 __date__ = "2023-07-11"
@@ -28,10 +29,9 @@
 # model_name = "mp_e_form_alignn"  # pre-trained by NIST
 model_name = f"{out_dir}/best-model.pth"
 task_type = "IS2RE"
-target_col = "e_form_per_atom_mp2020_corrected"
 input_col = "initial_structure"
 id_col = "material_id"
-job_name = f"{model_name}-wbm-{task_type}{'-debug' if DEBUG else ''}"
+job_name = f"{model_name}-wbm-{task_type}"
 out_path = (
     f"{out_dir}/{'alignn-relaxed-structs' if batch == 0 else f'{batch=}'}.json.gz"
 )
diff --git a/models/alignn_ff/test_alignn_ff.py b/models/alignn_ff/test_alignn_ff.py
@@ -18,9 +18,10 @@
 from sklearn.metrics import r2_score
 from tqdm import tqdm
 
-from matbench_discovery import DEBUG, today
+from matbench_discovery import today
 from matbench_discovery.data import DATA_FILES, df_wbm
 from matbench_discovery.plots import wandb_scatter
+from matbench_discovery.preds import e_form_col as target_col
 
 __author__ = "Philipp Benner, Janosh Riebesell"
 __date__ = "2023-07-11"
@@ -32,12 +33,11 @@
 n_splits = 100
 # model_name = "mp_e_form_alignnn"  # pre-trained by NIST
 task_type = "IS2RE"
-target_col = "e_form_per_atom_mp2020_corrected"
 input_col = "initial_structure"
 id_col = "material_id"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model_name = f"alignn-ff-wbm-{task_type}"
-job_name = f"{model_name}-relaxed-wbm-{task_type}{'-debug' if DEBUG else ''}"
+job_name = f"{model_name}-relaxed-wbm-{task_type}"
 out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
 in_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
 
diff --git a/models/cgcnn/test_cgcnn.py b/models/cgcnn/test_cgcnn.py
@@ -16,6 +16,7 @@
 from matbench_discovery import CHECKPOINT_DIR, ROOT, WANDB_PATH, today
 from matbench_discovery.data import DATA_FILES, df_wbm
 from matbench_discovery.plots import wandb_scatter
+from matbench_discovery.preds import e_form_col as target_col
 from matbench_discovery.slurm import slurm_submit
 
 __author__ = "Janosh Riebesell"
@@ -53,8 +54,7 @@
 
 df = pd.read_json(data_path).set_index("material_id")
 
-e_form_col = "e_form_per_atom_mp2020_corrected"
-df[e_form_col] = df_wbm[e_form_col]
+df[target_col] = df_wbm[target_col]
 if task_type == "RS2RE":
     df[input_col] = [x["structure"] for x in df.computed_structure_entry]
 assert input_col in df, f"{input_col=} not in {list(df)}"
@@ -87,7 +87,7 @@
     versions={dep: version(dep) for dep in ("aviary", "numpy", "torch")},
     ensemble_size=len(runs),
     task_type=task_type,
-    target_col=e_form_col,
+    target_col=target_col,
     input_col=input_col,
     wandb_run_filters=filters,
     slurm_vars=slurm_vars,
@@ -97,7 +97,7 @@
 wandb.init(project="matbench-discovery", name=job_name, config=run_params)
 
 cg_data = CrystalGraphData(
-    df, task_dict={e_form_col: "regression"}, structure_col=input_col
+    df, task_dict={target_col: "regression"}, structure_col=input_col
 )
 data_loader = DataLoader(
     cg_data, batch_size=1024, shuffle=False, collate_fn=collate_batch
@@ -110,16 +110,16 @@
     # dropping isolated-atom structs means len(cg_data.df) < len(df)
     cache_dir=CHECKPOINT_DIR,
     df=cg_data.df.drop(columns=input_col),
-    target_col=e_form_col,
+    target_col=target_col,
     model_cls=CrystalGraphConvNet,
     data_loader=data_loader,
 )
 
 slurm_job_id = os.getenv("SLURM_JOB_ID", "debug")
 df.round(4).to_csv(f"{out_dir}/{job_name}-preds-{slurm_job_id}.csv.gz")
-pred_col = f"{e_form_col}_pred_ens"
+pred_col = f"{target_col}_pred_ens"
 assert pred_col in df, f"{pred_col=} not in {list(df)}"
-table = wandb.Table(dataframe=df[[e_form_col, pred_col]].reset_index())
+table = wandb.Table(dataframe=df[[target_col, pred_col]].reset_index())
 
 
 # %%
@@ -128,4 +128,4 @@
 
 title = f"CGCNN {task_type} ensemble={len(runs)} {MAE=:.4} {R2=:.4}"
 
-wandb_scatter(table, fields=dict(x=e_form_col, y=pred_col), title=title)
+wandb_scatter(table, fields=dict(x=target_col, y=pred_col), title=title)
diff --git a/models/mace/analyze_mace.py b/models/mace/analyze_mace.py
@@ -6,17 +6,18 @@
 
 import pandas as pd
 from pymatviz import density_scatter, ptable_heatmap_plotly, spacegroup_sunburst
+from pymatviz.utils import save_fig
 
 from matbench_discovery import plots as plots
 from matbench_discovery.data import df_wbm
 from matbench_discovery.preds import PRED_FILES
+from matbench_discovery.preds import e_form_col as target_col
 
 __author__ = "Janosh Riebesell"
 __date__ = "2023-07-23"
 
 module_dir = os.path.dirname(__file__)
 id_col = "material_id"
-target_col = "e_form_per_atom_mp2020_corrected"
 pred_col = "e_form_per_atom_mace"
 
 
@@ -29,29 +30,36 @@
 
 
 # %%
-density_scatter(df=df_mace, x=target_col, y=pred_col)
+ax = density_scatter(df=df_mace, x=target_col, y=pred_col)
+ax.set(title=f"{len(df_mace):,} MACE formation energy predictions")
+save_fig(ax, "mace-hull-dist-scatter.pdf")
 
 
 # %%
-df_bad = df_mace.query(f"{target_col} - {pred_col} > 2")
+df_low = df_mace.query(f"{target_col} - {pred_col} > 2")
 
-ax = density_scatter(df=df_bad, x=target_col, y=pred_col)
-ax.set(title=f"{len(df_bad):,} MACE severe energy underpredictions")
+ax = density_scatter(df=df_low, x=target_col, y=pred_col)
+ax.set(title=f"{len(df_low):,} MACE severe energy underpredictions")
+save_fig(ax, "mace-too-low-hull-dist-scatter.pdf")
 
 
 # %%
-fig = ptable_heatmap_plotly(df_bad.formula)
-title = f"Elements in {len(df_bad):,} MACE severe energy underpredictions"
+fig = ptable_heatmap_plotly(df_low.formula)
+title = f"Elements in {len(df_low):,} MACE severe energy underpredictions"
 fig.layout.title.update(text=title, x=0.4, y=0.95)
 fig.show()
 
+save_fig(fig, "mace-too-low-elements-heatmap.pdf")
+
 
 # %%
-fig = spacegroup_sunburst(df_bad[spg_col], title="MACE spacegroups")
-title = f"Spacegroup sunburst of {len(df_bad):,} MACE severe energy underpredictions"
+fig = spacegroup_sunburst(df_low[spg_col], title="MACE spacegroups")
+title = f"Spacegroup sunburst of {len(df_low):,} MACE severe energy underpredictions"
 fig.layout.title.update(text=title, x=0.5)
 fig.show()
 
+save_fig(fig, "mace-too-low-spacegroup-sunburst.pdf")
+
 
 """
 Space groups of MACE underpredictions look unremarkable but unusually heavy in Silicon,
diff --git a/models/megnet/test_megnet.py b/models/megnet/test_megnet.py
@@ -23,7 +23,7 @@
 from matbench_discovery import timestamp, today
 from matbench_discovery.data import DATA_FILES, df_wbm
 from matbench_discovery.plots import wandb_scatter
-from matbench_discovery.preds import PRED_FILES
+from matbench_discovery.preds import PRED_FILES, e_form_col
 from matbench_discovery.slurm import slurm_submit
 
 __author__ = "Janosh Riebesell"
@@ -63,7 +63,6 @@
 }[task_type]
 print(f"\nJob started running {timestamp}")
 print(f"{data_path=}")
-e_form_col = "e_form_per_atom_mp2020_corrected"
 assert e_form_col in df_wbm, f"{e_form_col=} not in {list(df_wbm)=}"
 
 df_in: pd.DataFrame = np.array_split(
diff --git a/models/voronoi/train_test_voronoi_rf.py b/models/voronoi/train_test_voronoi_rf.py
@@ -15,6 +15,7 @@
 from matbench_discovery import today
 from matbench_discovery.data import DATA_FILES, df_wbm, glob_to_df
 from matbench_discovery.plots import wandb_scatter
+from matbench_discovery.preds import e_form_col as test_e_form_col
 from matbench_discovery.slurm import slurm_submit
 from models.voronoi import featurizer
 
@@ -55,8 +56,6 @@
 df_test = pd.read_csv(test_path).set_index("material_id")
 print(f"{df_test.shape=}")
 
-test_e_form_col = "e_form_per_atom_mp2020_corrected"
-
 
 for df, df_tar, col in (
     (df_train, df_mp, train_e_form_col),
diff --git a/models/wrenformer/test_wrenformer.py b/models/wrenformer/test_wrenformer.py
@@ -20,6 +20,7 @@
 from matbench_discovery import CHECKPOINT_DIR, WANDB_PATH, today
 from matbench_discovery.data import DATA_FILES
 from matbench_discovery.plots import wandb_scatter
+from matbench_discovery.preds import e_form_col
 from matbench_discovery.slurm import slurm_submit
 
 __author__ = "Janosh Riebesell"
@@ -44,7 +45,6 @@
 
 
 # %%
-e_form_col = "e_form_per_atom_mp2020_corrected"
 input_col = "wyckoff_spglib"
 df = pd.read_csv(data_path).dropna(subset=input_col).set_index("material_id")
 
diff --git a/scripts/model_figs/hist_classified_stable_vs_hull_dist_models.py b/scripts/model_figs/hist_classified_stable_vs_hull_dist_models.py
@@ -12,7 +12,13 @@
 
 from matbench_discovery import FIGS, PDF_FIGS, today
 from matbench_discovery.plots import hist_classified_stable_vs_hull_dist, plt
-from matbench_discovery.preds import df_metrics, df_preds, e_form_col, each_true_col
+from matbench_discovery.preds import (
+    df_metrics,
+    df_preds,
+    e_form_col,
+    each_pred_col,
+    each_true_col,
+)
 
 __author__ = "Janosh Riebesell"
 __date__ = "2022-12-01"
@@ -21,7 +27,6 @@
 # %%
 hover_cols = (df_preds.index.name, e_form_col, each_true_col, "formula")
 e_form_preds = "e_form_per_atom_pred"
-each_pred_col = "e_above_hull_pred"
 facet_col = "Model"
 # sort facet plots by model's F1 scores (optionally only show top n=6)
 models = list(df_metrics.T.F1.sort_values().index)[::-1]
diff --git a/scripts/model_figs/per_element_errors.py b/scripts/model_figs/per_element_errors.py
@@ -147,7 +147,7 @@
 # %%
 expected_cols = {
     *"ALIGNN, BOWSR, CGCNN, CGCNN+P, CHGNet, M3GNet, MEGNet, "
-    f"{train_count_col}, Mean error all models, {test_set_std_col}, Voronoi RF, "
+    f"{train_count_col}, {model_mean_err_col}, {test_set_std_col}, Voronoi RF, "
     "Wrenformer".split(", ")
 }
 assert {*df_elem_err} >= expected_cols
diff --git a/tests/test_plots.py b/tests/test_plots.py
@@ -17,14 +17,16 @@
     hist_classified_stable_vs_hull_dist,
     rolling_mae_vs_hull_dist,
 )
-from matbench_discovery.preds import load_df_wbm_with_preds
+from matbench_discovery.preds import (
+    e_form_col,
+    each_pred_col,
+    each_true_col,
+    load_df_wbm_with_preds,
+)
 
 AxLine = Literal["x", "y", "xy", ""]
 models = ["MEGNet", "CGCNN", "Voronoi RF"]
 df_wbm = load_df_wbm_with_preds(models, nrows=100)
-each_true_col = "e_above_hull_mp2020_corrected_ppd_mp"
-each_pred_col = "e_above_hull_pred"
-e_form_col = "e_form_per_atom_mp2020_corrected"
 
 
 @pytest.mark.parametrize(

Original file line number	Diff line number	Diff line change
`@@ -147,7 +147,7 @@`
`147`	`147`	`# %%`
`148`	`148`	`expected_cols = {`
`149`	`149`	`*"ALIGNN, BOWSR, CGCNN, CGCNN+P, CHGNet, M3GNet, MEGNet, "`
`150`		`- f"{train_count_col}, Mean error all models, {test_set_std_col}, Voronoi RF, "`
	`150`	`+ f"{train_count_col}, {model_mean_err_col}, {test_set_std_col}, Voronoi RF, "`
`151`	`151`	`"Wrenformer".split(", ")`
`152`	`152`	`}`
`153`	`153`	`assert {*df_elem_err} >= expected_cols`