janosh
diff --git a/‎matbench_discovery/metrics.py
+10-4 b/‎matbench_discovery/metrics.py
+10-4
diff --git a/‎matbench_discovery/plots.py
+2 b/‎matbench_discovery/plots.py
+2
diff --git a/‎matbench_discovery/preds.py
+3-3 b/‎matbench_discovery/preds.py
+3-3
diff --git a/‎models/bowsr/test_bowsr.py
+3-7 b/‎models/bowsr/test_bowsr.py
+3-7
diff --git a/‎models/chgnet/join_chgnet_results.py
+2-43 b/‎models/chgnet/join_chgnet_results.py
+2-43
diff --git a/‎models/chgnet/metadata.yml
+1-1 b/‎models/chgnet/metadata.yml
+1-1
diff --git a/‎models/chgnet/test_chgnet.py
+4-5 b/‎models/chgnet/test_chgnet.py
+4-5
diff --git a/‎models/m3gnet/join_m3gnet_results.py
+1-43 b/‎models/m3gnet/join_m3gnet_results.py
+1-43
diff --git a/‎models/m3gnet/test_m3gnet.py
+3-4 b/‎models/m3gnet/test_m3gnet.py
+3-4
diff --git a/‎models/megnet/test_megnet.py
+15-4 b/‎models/megnet/test_megnet.py
+15-4
diff --git a/‎models/voronoi/voronoi_featurize_dataset.py
+3-4 b/‎models/voronoi/voronoi_featurize_dataset.py
+3-4
diff --git a/‎scripts/compile_metrics.py
+5-4 b/‎scripts/compile_metrics.py
+5-4
diff --git a/‎scripts/cumulative_clf_metrics.py
+1-1 b/‎scripts/cumulative_clf_metrics.py
+1-1
@@ -86,17 +86,23 @@ def stable_metrics(
     is_nan = np.isnan(each_true) | np.isnan(each_pred)
     each_true, each_pred = np.array(each_true)[~is_nan], np.array(each_pred)[~is_nan]
 
+    TPR = recall
+    FPR = n_false_pos / n_total_neg
+    TNR = n_true_neg / n_total_neg
+    FNR = n_false_neg / n_total_pos
+    # sanity check: false positives + true negatives = all negatives
+    assert FPR + TNR == 1
+    # sanity check: true positives + false negatives = all positives
+    assert TPR + FNR == 1
+
     return dict(
         F1=2 * (precision * recall) / (precision + recall),
         R2=r2_score(each_true, each_pred),
         DAF=precision / prevalence,
         Precision=precision,
         Recall=recall,
+        **dict(TPR=TPR, FPR=FPR, TNR=TNR, FNR=FNR),
         Accuracy=(n_true_pos + n_true_neg) / len(each_true),
-        TPR=n_true_pos / n_total_pos,
-        FPR=n_false_pos / n_total_neg,
-        TNR=n_true_neg / n_total_neg,
-        FNR=n_false_neg / n_total_pos,
         MAE=np.abs(each_true - each_pred).mean(),
         RMSE=((each_true - each_pred) ** 2).mean() ** 0.5,
     )
@@ -486,6 +486,8 @@ def rolling_mae_vs_hull_dist(
             yanchor="bottom",
             title_font=dict(size=15),
         )
+        # change tooltip precision to 2 decimal places
+        ax.update_traces(hovertemplate="x = %{x:.2f} eV/atom<br>y = %{y:.2f} eV/atom")
         ax.layout.xaxis.title.text = "E<sub>above MP hull</sub> (eV/atom)"
         ax.layout.yaxis.title.text = "rolling MAE (eV/atom)"
         ax.update_xaxes(range=x_lim)
 
@@ -32,16 +32,16 @@ class PredFiles(Files):
     # bowsr optimizer coupled with original megnet
     bowsr_megnet = "bowsr/2023-01-23-bowsr-megnet-wbm-IS2RE.csv"
     # default CHGNet model from publication with 400,438 params
-    chgnet = "chgnet/2023-03-04-chgnet-wbm-IS2RE.csv"
-    chgnet_megnet = "chgnet/2023-03-04-chgnet-wbm-IS2RE.csv"
+    chgnet = "chgnet/2023-03-06-chgnet-wbm-IS2RE.csv"
+    # chgnet_megnet = "chgnet/2023-03-04-chgnet-wbm-IS2RE.csv"
     # CGCNN 10-member ensemble
     cgcnn = "cgcnn/2023-01-26-test-cgcnn-wbm-IS2RE/cgcnn-ensemble-preds.csv"
     # cgcnn 10-member ensemble with 5-fold training set perturbations
     cgcnn_p = "cgcnn/2023-02-05-cgcnn-perturb=5.csv"
     # original m3gnet straight from publication, not re-trained
     m3gnet = "m3gnet/2022-10-31-m3gnet-wbm-IS2RE.csv"
     # m3gnet-relaxed structures fed into megnet for formation energy prediction
-    m3gnet_megnet = "m3gnet/2022-10-31-m3gnet-wbm-IS2RE.csv"
+    # m3gnet_megnet = "m3gnet/2022-10-31-m3gnet-wbm-IS2RE.csv"
     # original megnet straight from publication, not re-trained
     megnet = "megnet/2022-11-18-megnet-wbm-IS2RE/megnet-e-form-preds.csv"
     # magpie composition+voronoi tessellation structure features + sklearn random forest
 
@@ -72,13 +72,9 @@
 print(f"{data_path = }")
 print(f"{out_path = }")
 
-
-# %%
-df_wbm = pd.read_json(data_path).set_index("material_id")
-
-df_in: pd.DataFrame = np.array_split(df_wbm, slurm_array_task_count)[
-    slurm_array_task_id - 1
-]
+df_in: pd.DataFrame = np.array_split(
+    pd.read_json(data_path).set_index("material_id"), slurm_array_task_count
+)[slurm_array_task_id - 1]
 
 
 # %%
 
@@ -11,8 +11,6 @@
 from glob import glob
 
 import pandas as pd
-from megnet.utils.models import load_model
-from pymatgen.core import Structure
 from pymatviz import density_scatter
 from tqdm import tqdm
 
@@ -62,53 +60,14 @@
 
 
 # %%
-ax = density_scatter(x=df_wbm[e_form_col], y=df_wbm[e_form_chgnet_col])
-
-
-# %% load 2019 MEGNet formation energy model
-megnet_mp_e_form = load_model("Eform_MP_2019")
-megnet_e_form_preds: dict[str, float] = {}
-
-
-# %% predict formation energies on chgnet relaxed structure with MEGNet
-for material_id, struct in tqdm(
-    df_chgnet.chgnet_structure.items(), total=len(df_chgnet)
-):
-    if material_id in megnet_e_form_preds:
-        continue
-    try:
-        if isinstance(struct, dict):
-            struct = Structure.from_dict(struct)
-        [e_form_per_atom] = megnet_mp_e_form.predict_structure(struct)
-        megnet_e_form_preds[material_id] = e_form_per_atom
-    except Exception as exc:
-        print(f"Failed to predict {material_id=}: {exc}")
-
-e_form_megnet_col = "e_form_per_atom_chgnet_megnet"
-# remove legacy MP corrections that MEGNet was trained on and apply newer MP2020
-# corrections instead
-df_chgnet[e_form_megnet_col] = (
-    pd.Series(megnet_e_form_preds)
-    - df_wbm.e_correction_per_atom_mp_legacy
-    + df_wbm.e_correction_per_atom_mp2020
-)
-
-assert (
-    n_isna := df_chgnet.e_form_per_atom_chgnet_megnet.isna().sum()
-) < 10, f"too many missing MEGNet preds: {n_isna}"
-
-
-# %%
-ax = density_scatter(df=df_chgnet, x=e_form_chgnet_col, y=e_form_megnet_col)
-ax = density_scatter(df=df_chgnet, x=e_form_col, y=e_form_megnet_col)
+ax = density_scatter(df=df_wbm, x=e_form_col, y=e_form_chgnet_col)
 
 
 # %%
 out_path = f"{module_dir}/{today}-chgnet-wbm-{task_type}.json.gz"
 df_chgnet = df_chgnet.round(4)
-df_chgnet.reset_index().to_json(out_path, default_handler=as_dict_handler)
-
 df_chgnet.select_dtypes("number").to_csv(out_path.replace(".json.gz", ".csv"))
+df_chgnet.reset_index().to_json(out_path, default_handler=as_dict_handler)
 
 # in_path = f"{module_dir}/2023-03-04-chgnet-wbm-IS2RE.json.gz"
 # df_chgnet = pd.read_csv(in_path.replace(".json.gz", ".csv")).set_index("material_id")
 
@@ -1,4 +1,4 @@
-model_name: [CHGNet, CHGNet + MEGNet]
+model_name: CHGNet
 model_version: 0.0.1
 matbench_discovery_version: 1.0
 date_added: "2023-03-03"
 
@@ -61,13 +61,12 @@
 }[task_type]
 print(f"\nJob started running {timestamp}")
 print(f"{data_path=}")
-df_in = pd.read_json(data_path).set_index("material_id")
 e_pred_col = "chgnet_energy"
 max_steps = 2000
 
-df_in: pd.DataFrame = np.array_split(df_in, slurm_array_task_count)[
-    slurm_array_task_id - 1
-]
+df_in: pd.DataFrame = np.array_split(
+    pd.read_json(data_path).set_index("material_id"), slurm_array_task_count
+)[slurm_array_task_id - 1]
 
 run_params = dict(
     data_path=data_path,
@@ -124,7 +123,7 @@
     ].reset_index()
 )
 
-title = f"CHGNet {task_type} ({len(df_wbm):,})"
+title = f"CHGNet {task_type} ({len(df_out):,})"
 wandb_scatter(table, fields=dict(x="uncorrected_energy", y=e_pred_col), title=title)
 
 wandb.log_artifact(out_path, type=f"chgnet-wbm-{task_type}")
@@ -11,15 +11,14 @@
 from glob import glob
 
 import pandas as pd
-from megnet.utils.models import load_model
 from pymatgen.core import Structure
 from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
 from pymatgen.entries.computed_entries import ComputedStructureEntry
 from pymatviz import density_scatter
 from tqdm import tqdm
 
 from matbench_discovery import today
-from matbench_discovery.data import DATA_FILES, as_dict_handler, df_wbm
+from matbench_discovery.data import DATA_FILES, as_dict_handler
 from matbench_discovery.energy import get_e_form_per_atom
 
 __author__ = "Janosh Riebesell"
@@ -93,47 +92,6 @@
 )
 
 
-# %% load 2019 MEGNet formation energy model
-megnet_mp_e_form = load_model("Eform_MP_2019")
-megnet_e_form_preds: dict[str, float] = {}
-
-
-# %% predict formation energies on M3GNet relaxed structure with MEGNet
-for material_id, struct in tqdm(
-    df_m3gnet.m3gnet_structure.items(), total=len(df_m3gnet)
-):
-    if material_id in megnet_e_form_preds:
-        continue
-    try:
-        if isinstance(struct, dict):
-            struct = struct = Structure.from_dict(struct)
-            df_m3gnet.loc[material_id, struct_col] = struct
-
-        [e_form_per_atom] = megnet_mp_e_form.predict_structure(struct)
-        megnet_e_form_preds[material_id] = e_form_per_atom
-    except Exception as exc:
-        print(f"Failed to predict {material_id=}: {exc}")
-
-pred_col_megnet = "e_form_per_atom_m3gnet_megnet"
-# remove legacy MP corrections that MEGNet was trained on and apply newer MP2020
-# corrections instead
-df_m3gnet[pred_col_megnet] = (
-    pd.Series(megnet_e_form_preds)
-    - df_wbm.e_correction_per_atom_mp_legacy
-    + df_wbm.e_correction_per_atom_mp2020
-)
-
-assert (
-    n_isna := df_m3gnet.e_form_per_atom_m3gnet_megnet.isna().sum()
-) < 10, f"too many missing MEGNet preds: {n_isna}"
-
-
-# %%
-ax = density_scatter(
-    df=df_m3gnet, x="e_form_per_atom_m3gnet", y="e_form_per_atom_m3gnet_megnet"
-)
-
-
 # %%
 out_path = f"{module_dir}/{today}-m3gnet-wbm-{task_type}.json.gz"
 df_m3gnet = df_m3gnet.round(4)
 
@@ -66,12 +66,11 @@
 }[task_type]
 print(f"\nJob started running {timestamp}")
 print(f"{data_path=}")
-df_wbm = pd.read_json(data_path).set_index("material_id")
 e_pred_col = "m3gnet_energy"
 
-df_in: pd.DataFrame = np.array_split(df_wbm, slurm_array_task_count)[
-    slurm_array_task_id - 1
-]
+df_in: pd.DataFrame = np.array_split(
+    pd.read_json(data_path).set_index("material_id"), slurm_array_task_count
+)[slurm_array_task_id - 1]
 
 run_params = dict(
     data_path=data_path,
 
@@ -11,6 +11,7 @@
 import os
 from importlib.metadata import version
 
+import numpy as np
 import pandas as pd
 import wandb
 from megnet.utils.models import load_model
@@ -21,15 +22,17 @@
 from matbench_discovery import DEBUG, timestamp, today
 from matbench_discovery.data import DATA_FILES, df_wbm
 from matbench_discovery.plots import wandb_scatter
+from matbench_discovery.preds import PRED_FILES
 from matbench_discovery.slurm import slurm_submit
 
 __author__ = "Janosh Riebesell"
 __date__ = "2022-11-14"
 
-task_type = "IS2RE"
+task_type = "chgnet_structure"
 module_dir = os.path.dirname(__file__)
 job_name = f"megnet-wbm-{task_type}{'-debug' if DEBUG else ''}"
 out_dir = os.environ.get("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
+slurm_array_task_count = 20
 
 slurm_vars = slurm_submit(
     job_name=job_name,
@@ -38,27 +41,33 @@
     account="LEE-SL3-CPU",
     time="12:0:0",
     slurm_flags=("--mem", "30G"),
+    array=f"1-{slurm_array_task_count}",
     # TF_CPP_MIN_LOG_LEVEL=2 means INFO and WARNING logs are not printed
     # https://stackoverflow.com/a/40982782
     pre_cmd="TF_CPP_MIN_LOG_LEVEL=2",
 )
 
 
 # %%
+slurm_array_task_id = int(os.environ.get("SLURM_ARRAY_TASK_ID", 0))
 out_path = f"{out_dir}/megnet-e-form-preds.csv"
 if os.path.isfile(out_path):
     raise SystemExit(f"{out_path = } already exists, exciting early")
 
 data_path = {
     "IS2RE": DATA_FILES.wbm_initial_structures,
     "RS2RE": DATA_FILES.wbm_computed_structure_entries,
+    "chgnet_structure": PRED_FILES.__dict__["CHGNet"].replace(".csv", ".json.gz"),
+    "m3gnet_structure": PRED_FILES.__dict__["M3GNet"].replace(".csv", ".json.gz"),
 }[task_type]
 print(f"\nJob started running {timestamp}")
 print(f"{data_path=}")
 e_form_col = "e_form_per_atom_mp2020_corrected"
 assert e_form_col in df_wbm, f"{e_form_col=} not in {list(df_wbm)=}"
 
-df_in = pd.read_json(data_path).set_index("material_id")
+df_in: pd.DataFrame = np.array_split(
+    pd.read_json(data_path).set_index("material_id"), slurm_array_task_count
+)[slurm_array_task_id - 1]
 megnet_mp_e_form = load_model(model_name := "Eform_MP_2019")
 
 
@@ -77,15 +86,17 @@
 
 
 # %%
-input_col = {"IS2RE": "initial_structure", "RS2RE": "relaxed_structure"}[task_type]
+input_col = {"IS2RE": "initial_structure", "RS2RE": "relaxed_structure"}.get(
+    task_type, task_type  # input_col=task_type for CHGNet and M3GNet
+)
 
 if task_type == "RS2RE":
     df_in[input_col] = [x["structure"] for x in df_in.computed_structure_entry]
 
 structures = df_in[input_col].map(Structure.from_dict).to_dict()
 
 megnet_e_form_preds = {}
-for material_id in tqdm(structures, disable=None):
+for material_id in tqdm(structures):
     if material_id in megnet_e_form_preds:
         continue
     try:
 
@@ -59,10 +59,9 @@
     raise SystemExit(f"{out_path = } already exists, exciting early")
 
 print(f"{data_path=}")
-df = pd.read_json(data_path).set_index("material_id")
-df_in: pd.DataFrame = np.array_split(df, slurm_array_task_count)[
-    slurm_array_task_id - 1
-]
+df_in: pd.DataFrame = np.array_split(
+    pd.read_json(data_path).set_index("material_id"), slurm_array_task_count
+)[slurm_array_task_id - 1]
 
 if data_name == "mp":  # extract structure dicts from ComputedStructureEntry
     struct_dicts = [x["structure"] for x in df_in.entry]
 
@@ -35,10 +35,10 @@
         ),
     ),
     "CHGNet": dict(
-        n_runs=100,
+        n_runs=102,
         filters=dict(
             display_name={"$regex": "chgnet-wbm-IS2RE-"},
-            created_at={"$lt": "2023-03-03"},
+            created_at={"$gt": "2023-03-05", "$lt": "2023-03-07"},
         ),
     ),
     "CGCNN": dict(
@@ -155,6 +155,8 @@
 }
 styler.set_table_styles([dict(selector=sel, props=styles[sel]) for sel in styles])
 styler.set_uuid("")
+# hide redundant metrics (TPR = Recall, FPR = 1 - TNR, FNR = 1 - TPR)
+styler.hide(["Recall", "FPR", "FNR"], axis=1)
 
 
 # %% export model metrics as styled HTML table
@@ -183,8 +185,7 @@
 
 df_stats.attrs["Total Run Time"] = df_stats[time_col].sum()
 
-stats_out = f"{MODELS}/model-stats.json"
-df_stats.round(2).to_json(stats_out, orient="index")
+df_stats.round(2).to_json(f"{MODELS}/model-stats.json", orient="index")
 
 
 # %% plot model run times as pie chart
 
@@ -48,7 +48,7 @@
     )
     fig.update_traces(line=dict(width=3))
     for trace in fig.data:
-        if trace.name in df_metrics.T.sort_values("F1").index[6:]:
+        if trace.name in df_metrics.T.sort_values("F1").index[:-6]:
             trace.visible = "legendonly"  # show only top models by default
         last_idx = pd.Series(trace.y).last_valid_index()
         last_x = trace.x[last_idx]
Original file line number	Diff line number	Diff line change
`@@ -486,6 +486,8 @@ def rolling_mae_vs_hull_dist(`
`486`	`486`	`yanchor="bottom",`
`487`	`487`	`title_font=dict(size=15),`
`488`	`488`	`)`
	`489`	`+ # change tooltip precision to 2 decimal places`
	`490`	`+ ax.update_traces(hovertemplate="x = %{x:.2f} eV/atom<br>y = %{y:.2f} eV/atom")`
`489`	`491`	`ax.layout.xaxis.title.text = "E<sub>above MP hull</sub> (eV/atom)"`
`490`	`492`	`ax.layout.yaxis.title.text = "rolling MAE (eV/atom)"`
`491`	`493`	`ax.update_xaxes(range=x_lim)`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-model_name: [CHGNet, CHGNet + MEGNet]`
	`1`	`+model_name: CHGNet`
`2`	`2`	`model_version: 0.0.1`
`3`	`3`	`matbench_discovery_version: 1.0`
`4`	`4`	`date_added: "2023-03-03"`
Original file line number	Diff line number	Diff line change
`@@ -48,7 +48,7 @@`
`48`	`48`	`)`
`49`	`49`	`fig.update_traces(line=dict(width=3))`
`50`	`50`	`for trace in fig.data:`
`51`		`- if trace.name in df_metrics.T.sort_values("F1").index[6:]:`
	`51`	`+ if trace.name in df_metrics.T.sort_values("F1").index[:-6]:`
`52`	`52`	`trace.visible = "legendonly" # show only top models by default`
`53`	`53`	`last_idx = pd.Series(trace.y).last_valid_index()`
`54`	`54`	`last_x = trace.x[last_idx]`