include legacy MP energy corrections in data/wbm/2022-10-19-wbm-summary.csv, use them to remove old and apply new corrections in test_megnet.py

janosh · janosh · commit 81668016b610 · 2023-06-19T20:29:24.000-07:00
diff --git a/data/wbm/fetch_process_wbm_dataset.py b/data/wbm/fetch_process_wbm_dataset.py
@@ -10,7 +10,10 @@
 from aviary.wren.utils import get_aflow_label_from_spglib
 from pymatgen.analysis.phase_diagram import PatchedPhaseDiagram
 from pymatgen.core import Composition, Structure
-from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
+from pymatgen.entries.compatibility import (
+    MaterialsProject2020Compatibility,
+    MaterialsProjectCompatibility,
+)
 from pymatgen.entries.computed_entries import ComputedStructureEntry
 from pymatviz import density_scatter
 from pymatviz.utils import save_fig
@@ -184,7 +187,6 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 cse_step_paths = sorted(glob(f"{module_dir}/raw/wbm-cse-step-*.json.bz2"))
 assert len(cse_step_paths) == 5
 
-
 """
 There is a discrepancy of 6 entries between the files on Materials Cloud containing the
 ComputedStructureEntries (CSE) and those on Google Drive containing initial+relaxed
@@ -496,10 +498,25 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
 assert all(df_summary.n_sites == [len(cse.structure) for cse in df_wbm.cse])
 
 
-compat_out = MaterialsProject2020Compatibility().process_entries(
-    entries=df_wbm.cse, clean=True, verbose=True
+# entries are corrected in-place by default so we apply legacy corrections first
+# and then leave the new corrections in place below
+# having both old and new corrections allows updating predictions from older models
+# like MEGNet that were trained on MP releases prior to the new corrections by
+# subtracting the old corrections and adding the new ones
+entries_old_corr = MaterialsProjectCompatibility().process_entries(
+    df_wbm.cse, clean=True, verbose=True
+)
+assert len(entries_old_corr) == len(df_wbm), f"{len(entries_old_corr)=} {len(df_wbm)=}"
+
+# extract legacy MP energy corrections to df_wbm
+e_correction_col = "e_correction_per_atom_mp_legacy"
+df_wbm[e_correction_col] = [cse.correction_per_atom for cse in df_wbm.cse]
+
+# clean up legacy corrections and apply new corrections
+entries_new_corr = MaterialsProject2020Compatibility().process_entries(
+    df_wbm.cse, clean=True, verbose=True
 )
-assert len(compat_out) == len(df_wbm) == len(df_summary)
+assert len(entries_new_corr) == len(df_wbm), f"{len(entries_new_corr)=} {len(df_wbm)=}"
 
 n_corrected = sum(cse.uncorrected_energy != cse.energy for cse in df_wbm.cse)
 assert n_corrected == 100_930, f"{n_corrected=} expected 100,930"
diff --git a/matbench_discovery/plots.py b/matbench_discovery/plots.py
@@ -55,6 +55,7 @@
     m3gnet_megnet="M3GNet + MEGNet",
     m3gnet="M3GNet",
     megnet="MEGNet",
+    megnet_old="MEGNet Old",
     voronoi_rf="Voronoi Random Forest",
     wrenformer="Wrenformer",
     dft="DFT",
diff --git a/models/m3gnet/2022-10-31-m3gnet-wbm-IS2RE.csv b/models/m3gnet/2022-10-31-m3gnet-wbm-IS2RE.csv
diff --git a/models/m3gnet/join_m3gnet_results.py b/models/m3gnet/join_m3gnet_results.py
@@ -19,7 +19,7 @@
 from tqdm import tqdm
 
 from matbench_discovery import today
-from matbench_discovery.data import DATA_FILES, as_dict_handler
+from matbench_discovery.data import DATA_FILES, as_dict_handler, df_wbm
 from matbench_discovery.energy import get_e_form_per_atom
 
 __author__ = "Janosh Riebesell"
@@ -124,7 +124,15 @@
     except Exception as exc:
         print(f"Failed to predict {material_id=}: {exc}")
 
-df_m3gnet["e_form_per_atom_m3gnet_megnet"] = pd.Series(megnet_e_form_preds)
+pred_col_megnet = "e_form_per_atom_m3gnet_megnet"
+df_m3gnet[f"{pred_col_megnet}_old"] = pd.Series(megnet_e_form_preds)
+# remove legacy MP corrections that MEGNet was trained on and apply newer MP2020
+# corrections instead
+df_m3gnet[pred_col_megnet] = (
+    df_m3gnet[f"{pred_col_megnet}_old"]
+    - df_wbm.e_correction_per_atom_mp_legacy
+    + df_wbm.e_correction_per_atom_mp2020
+)
 
 assert (
     n_isna := df_m3gnet.e_form_per_atom_m3gnet_megnet.isna().sum()
@@ -145,5 +153,5 @@
 df_m3gnet.select_dtypes("number").to_csv(out_path.replace(".json.gz", ".csv"))
 
 # in_path = f"{module_dir}/2022-10-31-m3gnet-wbm-IS2RE.json.gz"
-# df_m3gnet_csv = pd.read_csv(in_path.replace(".json.gz", ".csv"))
+# df_m3gnet = pd.read_csv(in_path.replace(".json.gz", ".csv")).set_index("material_id")
 # df_m3gnet = pd.read_json(in_path).set_index("material_id")
diff --git a/models/megnet/test_megnet.py b/models/megnet/test_megnet.py
@@ -14,6 +14,8 @@
 import pandas as pd
 import wandb
 from megnet.utils.models import load_model
+from pymatgen.core import Structure
+from pymatgen.entries.computed_entries import ComputedStructureEntry
 from sklearn.metrics import r2_score
 from tqdm import tqdm
 
@@ -75,24 +77,19 @@
 
 # %%
 if task_type == "IS2RE":
-    from pymatgen.core import Structure
-
     structures = df_wbm_structs.initial_structure.map(Structure.from_dict)
 elif task_type == "RS2RE":
-    from pymatgen.entries.computed_entries import ComputedStructureEntry
-
     df_wbm_structs.cse = df_wbm_structs.cse.map(ComputedStructureEntry.from_dict)
     structures = df_wbm_structs.cse.map(lambda x: x.structure)
 else:
     raise ValueError(f"Unknown {task_type = }")
 
 megnet_e_form_preds = {}
-for material_id, structure in tqdm(
-    structures.items(), disable=None, total=len(structures)
-):
+for material_id in tqdm(structures.index, disable=None):  # labels, not values
     if material_id in megnet_e_form_preds:
         continue
     try:
+        structure = structures[material_id]
         e_form_per_atom = megnet_mp_e_form.predict_structure(structure)[0]
         megnet_e_form_preds[material_id] = e_form_per_atom
     except Exception as exc:
@@ -104,9 +101,23 @@
 print(f"{len(structures)=:,}")
 print(f"missing: {len(structures) - len(megnet_e_form_preds):,}")
 pred_col = "e_form_per_atom_megnet"
-df_wbm[pred_col] = pd.Series(megnet_e_form_preds)
+# old column contains the direct predictions of MEGNet, which was trained on
+# legacy-corrected MP formation energies
+df_wbm[f"{pred_col}_old"] = pd.Series(megnet_e_form_preds)
+
+# swap correction schemes: start from the raw MEGNet predictions (trained on legacy
+# MP corrections), subtract the legacy correction and add the MP2020 one
+df_wbm[pred_col] = (
+    df_wbm[f"{pred_col}_old"]
+    - df_wbm.e_correction_per_atom_mp_legacy
+    + df_wbm.e_correction_per_atom_mp2020
+)
+
+df_wbm.filter(like=pred_col).round(4).to_csv(
+    "2022-11-18-megnet-wbm-IS2RE/megnet-e-form-preds.csv"
+)
 
-df_wbm[pred_col].round(4).to_csv(out_path)
+# df_megnet = pd.read_csv(f"{ROOT}/models/{PRED_FILES.megnet}").set_index("material_id")
 
 
 # %%
diff --git a/scripts/rolling_mae_vs_hull_dist.py b/scripts/rolling_mae_vs_hull_dist.py
@@ -13,6 +13,8 @@
 # %%
 # model = "Wrenformer"
 model = "M3GNet + MEGNet"
+model = "MEGNet"
+model = "MEGNet Old"
 ax, df_err, df_std = rolling_mae_vs_hull_dist(
     e_above_hull_true=df_wbm[each_true_col],
     e_above_hull_errors={model: df_wbm[e_form_col] - df_wbm[model]},
@@ -21,8 +23,8 @@
     # template="plotly_white",
 )
 
-MAE, DAF = df_metrics[model].MAE, df_metrics[model].DAF
-title = f"{today} {model} · {MAE=:.2f} · {DAF=:.2f}"
+MAE, DAF, F1 = df_metrics[model][["MAE", "DAF", "F1"]]
+title = f"{today} {model} · {MAE=:.2f} · {DAF=:.2f} · {F1=:.2f}"
 if backend == "matplotlib":
     fig = ax.figure
     fig.set_size_inches(6, 5)
diff --git a/site/src/routes/contribute/+page.md b/site/src/routes/contribute/+page.md
@@ -35,7 +35,7 @@ assert sorted(DATA_FILES) == [
 
 df_wbm = load_train_test("wbm-summary", version="v1.0.0")
 
-assert df_wbm.shape == (256963, 14)
+assert df_wbm.shape == (256963, 15)
 
 assert list(df_wbm) == [
     "formula",
@@ -47,6 +47,7 @@ assert list(df_wbm) == [
     "bandgap_pbe",
     "uncorrected_energy_from_cse",
     "e_correction_per_atom_mp2020",
+    "e_correction_per_atom_mp_legacy",
     "e_above_hull_mp2020_corrected_ppd_mp",
     "e_form_per_atom_uncorrected",
     "e_form_per_atom_mp2020_corrected",
@@ -65,6 +66,8 @@ assert list(df_wbm) == [
 1. `bandgap_pbe`: PBE-level DFT band gap from [WBM paper]
 1. `uncorrected_energy_from_cse`: Should be the same as `uncorrected_energy`. There are 2 cases where the absolute difference reported in the summary file and in the computed structure entries exceeds 0.1 eV (`wbm-2-3218`, `wbm-1-56320`) which we attribute to rounding errors.
 1. `e_form_per_atom_mp2020_corrected`: Matbench Discovery takes these as ground truth for the formation energy. Includes MP2020 energy corrections (latest correction scheme at time of release).
+1. `e_correction_per_atom_mp2020`: [`MaterialsProject2020Compatibility`](https://pymatgen.org/pymatgen.entries.compatibility.html#pymatgen.entries.compatibility.MaterialsProject2020Compatibility) energy corrections in eV/atom.
+1. `e_correction_per_atom_mp_legacy`: Legacy [`MaterialsProjectCompatibility`](https://pymatgen.org/pymatgen.entries.compatibility.html#pymatgen.entries.compatibility.MaterialsProjectCompatibility) energy corrections in eV/atom. Having both old and new corrections allows updating predictions from older models like MEGNet that were trained on MP formation energies treated with the old correction scheme.
 1. `e_above_hull_mp2020_corrected_ppd_mp`: Energy above hull distances in eV/atom after applying the MP2020 correction scheme. The convex hull in question is the one spanned by all ~145k Materials Project `ComputedStructureEntries`. Matbench Discovery takes these as ground truth for material stability. Any value above 0 is assumed to be an unstable/metastable material.
 <!-- TODO document remaining columns, or maybe drop them from df -->
 
diff --git a/tests/test_data.py b/tests/test_data.py
@@ -173,7 +173,7 @@ def as_dict(self) -> dict[str, Any]:
 
 
 def test_df_wbm() -> None:
-    assert df_wbm.shape == (256963, 14)
+    assert df_wbm.shape == (256963, 15)
     assert df_wbm.index.name == "material_id"
     assert set(df_wbm) > {"bandgap_pbe", "formula", "material_id"}