janosh
diff --git a/matbench_discovery/plots.py
+119-102 b/matbench_discovery/plots.py
+119-102
diff --git a/matbench_discovery/preds.py
+18-11 b/matbench_discovery/preds.py
+18-11
diff --git a/models/megnet/test_megnet.py
+2-2 b/models/megnet/test_megnet.py
+2-2
diff --git a/models/voronoi/voronoi_featurize_dataset.py
+1-1 b/models/voronoi/voronoi_featurize_dataset.py
+1-1
diff --git a/scripts/compute_struct_fingerprints.py
+98 b/scripts/compute_struct_fingerprints.py
+98
diff --git a/scripts/hist_classified_stable_vs_hull_dist.py
+14-31 b/scripts/hist_classified_stable_vs_hull_dist.py
+14-31
diff --git a/scripts/hist_classified_stable_vs_hull_dist_models.py
+23-7 b/scripts/hist_classified_stable_vs_hull_dist_models.py
+23-7
@@ -29,23 +29,29 @@ class PredFiles(Files):
     _root = f"{ROOT}/models/"
     _key_map = model_labels  # remap model keys below to pretty plot labels (see Files)
 
-    # bowsr optimizer coupled with original megnet
+    # BOWSR optimizer coupled with original megnet
     bowsr_megnet = "bowsr/2023-01-23-bowsr-megnet-wbm-IS2RE.csv"
     # default CHGNet model from publication with 400,438 params
     chgnet = "chgnet/2023-03-06-chgnet-wbm-IS2RE.csv"
-    chgnet_megnet = "chgnet/2023-03-04-chgnet-wbm-IS2RE.csv"
+    # CHGNet-relaxed structures fed into MEGNet for formation energy prediction
+    # chgnet_megnet = "chgnet/2023-03-04-chgnet-wbm-IS2RE.csv"
+
     # CGCnn 10-member ensemble
     cgcnn = "cgcnn/2023-01-26-test-cgcnn-wbm-IS2RE/cgcnn-ensemble-preds.csv"
-    # cgcnn 10-member ensemble with 5-fold training set perturbations
+    # CGCnn 10-member ensemble with 5-fold training set perturbations
     cgcnn_p = "cgcnn/2023-02-05-cgcnn-perturb=5.csv"
-    # original m3gnet straight from publication, not re-trained
+
+    # original M3GNet straight from publication, not re-trained
     m3gnet = "m3gnet/2022-10-31-m3gnet-wbm-IS2RE.csv"
-    # m3gnet-relaxed structures fed into megnet for formation energy prediction
-    m3gnet_megnet = "m3gnet/2022-10-31-m3gnet-wbm-IS2RE.csv"
-    # original megnet straight from publication, not re-trained
+    # M3GNet-relaxed structures fed into MEGNet for formation energy prediction
+    # m3gnet_megnet = "m3gnet/2022-10-31-m3gnet-wbm-IS2RE.csv"
+
+    # original MEGNet straight from publication, not re-trained
     megnet = "megnet/2022-11-18-megnet-wbm-IS2RE/megnet-e-form-preds.csv"
-    # magpie composition+voronoi tessellation structure features + sklearn random forest
+
+    # Magpie composition+Voronoi tessellation structure features + sklearn random forest
     voronoi_rf = "voronoi/2022-11-27-train-test/e-form-preds-IS2RE.csv"
+
     # wrenformer 10-member ensemble
     wrenformer = "wrenformer/2022-11-15-wrenformer-IS2RE-preds.csv"
 
@@ -113,13 +119,14 @@ def load_df_wbm_with_preds(
 
 # load WBM summary dataframe with all models' formation energy predictions (eV/atom)
 df_preds = load_df_wbm_with_preds().round(3)
-for combo in [["CHGNet", "M3GNet"]]:
-    df_preds[" + ".join(combo)] = df_preds[combo].mean(axis=1)
+# for combo in [["CHGNet", "M3GNet"]]:
+#     df_preds[" + ".join(combo)] = df_preds[combo].mean(axis=1)
+#     PRED_FILES[" + ".join(combo)] = "combo"
 
 
 df_metrics = pd.DataFrame()
 df_metrics.index.name = "model"
-for model in [*PRED_FILES, "CHGNet + M3GNet"]:
+for model in PRED_FILES:
     df_metrics[model] = stable_metrics(
         df_preds[each_true_col],
         df_preds[each_true_col] + df_preds[model] - df_preds[e_form_col],
 
@@ -58,8 +58,8 @@
 data_path = {
     "IS2RE": DATA_FILES.wbm_initial_structures,
     "RS2RE": DATA_FILES.wbm_computed_structure_entries,
-    "chgnet_structure": PRED_FILES.CHGNet.replace(".csv", ".json.gz"),
-    "m3gnet_structure": PRED_FILES.M3GNet.replace(".csv", ".json.gz"),
+    "chgnet_structure": PRED_FILES.__dict__["CHGNet"].replace(".csv", ".json.gz"),
+    "m3gnet_structure": PRED_FILES.__dict__["M3GNet"].replace(".csv", ".json.gz"),
 }[task_type]
 print(f"\nJob started running {timestamp}")
 print(f"{data_path=}")
 
@@ -24,7 +24,7 @@
 __date__ = "2022-10-31"
 
 
-data_name = "mp"  # "mp"
+data_name = "mp"
 data_path = {
     "wbm": DATA_FILES.wbm_initial_structures,
     "mp": DATA_FILES.mp_computed_structure_entries,
 
@@ -0,0 +1,98 @@
+"""Compute CrystalNN site-stats fingerprints for initial and final MP/WBM structures.
+Motivation: find structures/compositions with largest mean error across all models.
+Maybe some chemistry is hard for all models, hinting at data/architecture deficiencies?
+"""
+
+
+# %% imports and module metadata
+import os
+import warnings
+
+import numpy as np
+import pandas as pd
+from matminer.featurizers.site import CrystalNNFingerprint
+from matminer.featurizers.structure import SiteStatsFingerprint
+from pymatgen.core import Structure
+from tqdm import tqdm
+
+from matbench_discovery import ROOT, timestamp
+from matbench_discovery.data import DATA_FILES
+from matbench_discovery.slurm import slurm_submit
+
+__author__ = "Janosh Riebesell"
+__date__ = "2023-03-26"
+
+warnings.filterwarnings(action="ignore", category=UserWarning, module="pymatgen")
+
+
+# %% compute all initial and final MP/WBM structure fingerprints (as SLURM array job)
+data_name = "wbm"  # key into data_path below: "wbm" or "mp"
+data_path = {
+    "wbm": DATA_FILES.wbm_cses_plus_init_structs,
+    "mp": DATA_FILES.mp_computed_structure_entries,
+}[data_name]
+
+slurm_array_task_id = int(os.getenv("SLURM_ARRAY_TASK_ID", 0))  # 0 if unset
+slurm_array_task_count = 100  # number of chunks the input data is split into
+
+job_name = f"make-{data_name}-struct-fingerprints"
+out_dir = f"{ROOT}/data/{data_name}/structure-fingerprints"
+os.makedirs(out_dir, exist_ok=True)
+
+slurm_vars = slurm_submit(  # NOTE(review): slurm_vars is not used below
+    job_name=job_name,
+    out_dir=out_dir,
+    partition="icelake-himem",
+    account="LEE-SL3-CPU",
+    time="6:0:0",
+    array=f"1-{slurm_array_task_count}",
+)
+
+
+# %% each array task writes its own file; stop if that file already exists
+out_path = f"{out_dir}/site-stats-{slurm_array_task_id}.json.gz"
+if os.path.isfile(out_path):  # NOTE(review): "exciting" below: typo for "exiting"?
+    raise SystemExit(f"{out_path = } already exists, exciting early")
+
+print(f"\nJob started running {timestamp}")
+print(f"{out_path=}")
+
+
+# %% load input data, keep only this task's chunk and set up the featurizers
+df_in: pd.DataFrame = np.array_split(
+    pd.read_json(data_path).set_index("material_id"), slurm_array_task_count
+)[slurm_array_task_id - 1]  # task IDs are 1-based; ID 0 (unset) selects last chunk
+
+cnn_fp = CrystalNNFingerprint.from_preset("ops")
+# including "minimum" and "maximum" increases the fingerprint length from 61 to 122
+site_stats_fp = SiteStatsFingerprint(
+    cnn_fp, stats=("mean", "std_dev", "minimum", "maximum")
+)
+
+
+# %% featurize every structure column present in df_in, then save the fingerprints
+init_struct_col = "initial_structure"
+final_struct_col = "computed_structure_entry"
+init_fp_col = "initial_site_stats_fingerprint"
+final_fp_col = "final_site_stats_fingerprint"
+for struct_col, fp_col in (
+    (init_struct_col, init_fp_col),
+    (final_struct_col, final_fp_col),
+    ("entry", final_fp_col),  # "entry" column also maps to the final fingerprint
+):
+    if struct_col not in df_in:  # skip structure columns this data set lacks
+        continue
+    df_in[fp_col] = None  # placeholder; filled row by row below
+
+    for row in tqdm(df_in.itertuples(), total=len(df_in)):
+        struct = getattr(row, struct_col)
+        if "structure" in struct:  # is a ComputedStructureEntry as dict
+            struct = struct["structure"]
+        struct = Structure.from_dict(struct)
+        try:
+            ss_fp = site_stats_fp.featurize(struct)
+            df_in.at[row.Index, fp_col] = ss_fp
+        except Exception as exc:  # best effort: report the failure and keep going
+            print(f"{fp_col} for {row.Index} failed: {exc}")
+
+df_in.filter(like="site_stats_fingerprint").to_json(out_path)  # fingerprints only
@@ -12,56 +12,39 @@
 from pymatviz.utils import save_fig
 
 from matbench_discovery import FIGS
-from matbench_discovery.metrics import stable_metrics
+from matbench_discovery.data import df_wbm
 from matbench_discovery.plots import hist_classified_stable_vs_hull_dist
-from matbench_discovery.preds import df_preds, e_form_col, each_pred_col, each_true_col
+from matbench_discovery.preds import df_each_pred, each_true_col
 
 __author__ = "Rhys Goodall, Janosh Riebesell"
 __date__ = "2022-06-18"
 
 
 # %%
 model_name = "Wrenformer"
+model_name = "CHGNet"
+# model_name = "M3GNet"
+# model_name = "Voronoi RF"
 which_energy: Final = "true"
-# std_factor=0,+/-1,+/-2,... changes the criterion for material stability to
-# energy+std_factor*std. energy+std means predicted energy plus the model's uncertainty
-# in the prediction have to be on or below the convex hull to be considered stable. This
-# reduces the false positive rate, but increases the false negative rate. Vice versa for
-# energy-std. energy+std should be used for cautious exploration, energy-std for
-# exhaustive exploration.
-std_factor = 0
-
-# TODO column names to compute standard deviation from are currently hardcoded
-# needs to be updated when adding non-aviary models with uncertainty estimation
-var_aleatoric = (df_preds.filter(like="_ale_") ** 2).mean(axis=1)
-var_epistemic = df_preds.filter(regex=r"_pred_\d").var(axis=1, ddof=0)
-std_total = (var_epistemic + var_aleatoric) ** 0.5
-std_total = df_preds[f"{model_name}_std"]
-df_preds[each_pred_col] = df_preds[each_true_col] + (
-    (df_preds[model_name] + std_factor * std_total) - df_preds[e_form_col]
-)
+df_each_pred[each_true_col] = df_wbm[each_true_col]
+backend: Final = "plotly"
 
 fig = hist_classified_stable_vs_hull_dist(
-    df_preds,
+    df_each_pred,
     each_true_col=each_true_col,
-    each_pred_col=each_pred_col,
+    each_pred_col=model_name,
     which_energy=which_energy,
     # stability_threshold=-0.05,
-    # rolling_acc=0,
-    backend="plotly",
+    # rolling_acc=None,
+    backend=backend,
 )
 
-metrics = stable_metrics(df_preds[each_true_col], df_preds[each_pred_col])
-legend_title = f"DAF = {metrics['DAF']:.3}"
-
-if hasattr(fig, "legend"):  # matplotlib
-    fig.legend(loc="upper left", frameon=False, title=legend_title)
-else:  # plotly
-    fig.layout.legend.title.text = legend_title
+if backend == "plotly":
+    fig.layout.title = model_name
     fig.show()
 
 
 # %%
-img_path = f"{FIGS}/wren-wbm-hull-dist-hist-{which_energy=}"
+img_path = f"{FIGS}/hist-clf-{which_energy}-hull-dist-{model_name}"
 # save_fig(fig, f"{img_path}.svelte")
 save_fig(fig, f"{img_path}.webp")
@@ -9,8 +9,11 @@
 
 from pymatviz.utils import save_fig
 
-from matbench_discovery import ROOT, STATIC, today
-from matbench_discovery.plots import hist_classified_stable_vs_hull_dist, plt
+from matbench_discovery import FIGS, ROOT, today
+from matbench_discovery.plots import (
+    hist_classified_stable_vs_hull_dist,
+    plt,
+)
 from matbench_discovery.preds import df_metrics, df_preds, e_form_col, each_true_col
 
 __author__ = "Janosh Riebesell"
@@ -58,6 +61,18 @@
     **kwds,  # type: ignore[arg-type]
 )
 
+# true_pos, false_neg, false_pos, true_neg = classify_stable(
+#     df_melt[each_true_col], df_melt[each_pred_col], stability_threshold=0
+# )
+# import numpy as np
+
+# df_melt[(clf_col := "classified")] = np.array(clf_labels)[
+#     true_pos * 0 + false_neg * 1 + false_pos * 2 + true_neg * 3
+# ]
+# import pandas as pd
+
+# pd.cut(df_melt[each_pred_col], bins=10).value_counts()
+
 
 # TODO add line showing the true hull distance histogram on each subplot
 show_metrics = False
@@ -91,11 +106,12 @@
         )
         anno.text = f"{model_name} · {F1=:.2f} · {FPR=:.2f} · {FNR=:.2f} · {DAF=:.2f}"
 
+    fig.layout.height = 1000
     fig.layout.margin.update(t=50, b=30, l=40, r=0)
     fig.layout.legend.update(
-        y=1.15, xanchor="center", x=0.5, bgcolor="rgba(0,0,0,0)", orientation="h"
+        y=1.1, xanchor="center", x=0.5, bgcolor="rgba(0,0,0,0)", orientation="h"
     )
-    fig.update_yaxes(range=[0, 3_000], title_text=None)
+    fig.update_yaxes(range=[0, 11_000], title_text=None)
 
     # for trace in fig.data:
     #     # no need to store all 250k x values in plot, leads to 1.7 MB file,
@@ -107,8 +123,8 @@
 
 
 # %%
-img_name = f"hist-{which_energy}-energy-vs-hull-dist-models"
-# save_fig(fig, f"{FIGS}/{img_name}.svelte")
+img_name = f"hist-clf-{which_energy}-hull-dist-models"
+save_fig(fig, f"{FIGS}/{img_name}.svelte")
 n_models = len(fig.layout.annotations)
-save_fig(fig, f"{STATIC}/{img_name}.webp", scale=3, height=100 * n_models)
+# save_fig(fig, f"{STATIC}/{img_name}.webp", scale=3, height=100 * n_models)
 save_fig(fig, f"{ROOT}/tmp/figures/{img_name}.pdf", height=550, width=600)