extract plot func rolling_mae_vs_hull_dist() from rolling MAE plot scripts

janosh · janosh · commit 8d65a7aad620 · 2023-06-19T20:29:21.000-07:00
diff --git a/mb_discovery/plot_scripts/hist_classified_stable_as_func_of_hull_dist.py b/mb_discovery/plot_scripts/hist_classified_stable_as_func_of_hull_dist.py
@@ -29,7 +29,7 @@
 
 plt.rc("savefig", bbox="tight", dpi=200)
 plt.rcParams["figure.constrained_layout.use"] = True
-plt.rc("figure", dpi=150)
+plt.rc("figure", dpi=200)
 plt.rc("font", size=16)
 
 
diff --git a/mb_discovery/plot_scripts/hist_classified_stable_as_func_of_hull_dist_batches.py b/mb_discovery/plot_scripts/hist_classified_stable_as_func_of_hull_dist_batches.py
@@ -28,7 +28,7 @@
 
 plt.rc("savefig", bbox="tight", dpi=200)
 plt.rcParams["figure.constrained_layout.use"] = True
-plt.rc("figure", dpi=150)
+plt.rc("figure", dpi=200)
 plt.rc("font", size=16)
 
 
diff --git a/mb_discovery/plot_scripts/plot_funcs.py b/mb_discovery/plot_scripts/plot_funcs.py
@@ -1,9 +1,12 @@
 from __future__ import annotations
 
-from typing import Literal, Sequence
+from typing import Any, Literal, Sequence
 
 import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd
+from mpl_toolkits.axes_grid1.anchored_artists import AnchoredSizeBar
+from scipy.stats import sem as std_err_of_mean
 
 
 __author__ = "Janosh Riebesell"
@@ -12,7 +15,7 @@
 
 plt.rc("savefig", bbox="tight", dpi=200)
 plt.rcParams["figure.constrained_layout.use"] = True
-plt.rc("figure", dpi=150)
+plt.rc("figure", dpi=200)
 plt.rc("font", size=16)
 
 
@@ -137,3 +140,149 @@ def hist_classified_stable_as_func_of_hull_dist(
     ax.set(xlabel=xlabel, ylabel="Number of compounds")
 
     return ax
+
+
+def rolling_mae_vs_hull_dist(
+    df: pd.DataFrame,
+    e_above_hull_col: str,
+    residual_col: str = "residual",
+    half_window: float = 0.02,
+    increment: float = 0.002,
+    x_lim: tuple[float, float] = (-0.2, 0.3),
+    ax: plt.Axes | None = None,
+    **kwargs: Any,
+) -> plt.Axes:
+    """Plot the rolling mean absolute error (MAE) against the distance to the hull.
+
+    The MAE is evaluated in a window of width 2 * half_window that is moved across
+    x_lim in steps of increment. The standard error of the mean is shaded around
+    the curve. The first call on an axes without lines also draws a scale bar for
+    the window width, reference lines with annotations and the highlighted V-shaped
+    region in which the average absolute error is greater than the energy to the
+    known convex hull. This is where models are most at risk of misclassifying
+    structures. Later calls on the same axes only add a curve and its error band.
+
+    Args:
+        df (pd.DataFrame): Data frame holding the two columns named below.
+        e_above_hull_col (str): Column with the energy above the convex hull. Its
+            values decide which rows fall into the window around each x-position.
+        residual_col (str, optional): Column with the model errors. The absolute
+            value is taken here, so signed residuals are fine. Defaults to
+            "residual".
+        half_window (float, optional): Half the width of the rolling window in the
+            units of e_above_hull_col (the scale bar label assumes eV). Defaults
+            to 0.02.
+        increment (float, optional): Spacing between window centers. Defaults to
+            0.002.
+        x_lim (tuple[float, float], optional): Range of window centers and x-axis
+            limits. Defaults to (-0.2, 0.3).
+        ax (plt.Axes, optional): Axes to draw on. Defaults to None, which uses the
+            current axes.
+        **kwargs: Passed on to ax.plot() for the MAE curve.
+
+    Returns:
+        plt.Axes: The axes that were drawn on.
+    """
+    if ax is None:
+        ax = plt.gca()
+
+    # the shared decorations are only drawn for the first curve on an axes
+    ax_is_fresh = len(ax.lines) == 0
+
+    bins = np.arange(*x_lim, increment)
+
+    # sort once so that every window is a contiguous slice found by binary search
+    df = df.sort_values(by=e_above_hull_col)
+    hull_dists = df[e_above_hull_col].to_numpy()
+    abs_residuals = df[residual_col].abs()
+    # side="right" at both edges selects the half-open interval (low, high]
+    starts = np.searchsorted(hull_dists, bins - half_window, side="right")
+    stops = np.searchsorted(hull_dists, bins + half_window, side="right")
+
+    # float arrays so that NaN (empty window) can be stored even for integer bins
+    rolling_maes = np.zeros(len(bins))
+    rolling_std_errs = np.zeros(len(bins))
+    for idx, (start, stop) in enumerate(zip(starts, stops)):
+        window = abs_residuals.iloc[start:stop]
+        rolling_maes[idx] = window.mean()
+        rolling_std_errs[idx] = std_err_of_mean(window)
+
+    [line] = ax.plot(bins, rolling_maes, **kwargs)
+
+    # color the error band like its curve instead of relying on the fill color
+    # cycle, which goes out of step once the caller passes an explicit line color
+    ax.fill_between(
+        bins,
+        rolling_maes + rolling_std_errs,
+        rolling_maes - rolling_std_errs,
+        facecolor=line.get_color(),
+        alpha=0.3,
+    )
+
+    if not ax_is_fresh:
+        # return earlier if all plot objects besides the line were already drawn by a
+        # previous call
+        return ax
+
+    scale_bar = AnchoredSizeBar(
+        ax.transData,
+        2 * half_window,
+        # label follows the window width, e.g. "40 meV" for half_window=0.02 (eV)
+        f"{2e3 * half_window:g} meV",
+        "lower left",
+        pad=0.5,
+        frameon=False,
+        size_vertical=0.002,
+    )
+
+    ax.add_artist(scale_bar)
+
+    ax.plot((0.05, 0.5), (0.05, 0.5), color="grey", linestyle="--", alpha=0.3)
+    ax.plot((-0.5, -0.05), (0.5, 0.05), color="grey", linestyle="--", alpha=0.3)
+    ax.plot((-0.05, 0.05), (0.05, 0.05), color="grey", linestyle="--", alpha=0.3)
+    ax.plot((-0.1, 0.1), (0.1, 0.1), color="grey", linestyle="--", alpha=0.3)
+
+    ax.fill_between(
+        (-0.5, -0.05, 0.05, 0.5),
+        (0.5, 0.5, 0.5, 0.5),
+        (0.5, 0.05, 0.05, 0.5),
+        color="tab:red",
+        alpha=0.2,
+    )
+
+    ax.plot((0, 0.05), (0, 0.05), color="grey", linestyle="--", alpha=0.3)
+    ax.plot((-0.05, 0), (0.05, 0), color="grey", linestyle="--", alpha=0.3)
+
+    ax.fill_between(
+        (-0.05, 0, 0.05),
+        (0.05, 0.05, 0.05),
+        (0.05, 0, 0.05),
+        color="tab:orange",
+        alpha=0.2,
+    )
+
+    arrowprops = dict(facecolor="black", width=0.5, headwidth=5, headlength=5)
+    ax.annotate(
+        xy=(0.055, 0.05),
+        xytext=(0.12, 0.05),
+        arrowprops=arrowprops,
+        text="Corrected\nGGA DFT\nAccuracy",
+        verticalalignment="center",
+        horizontalalignment="left",
+    )
+    ax.annotate(
+        xy=(0.105, 0.1),
+        xytext=(0.16, 0.1),
+        arrowprops=arrowprops,
+        text="GGA DFT\nAccuracy",
+        verticalalignment="center",
+        horizontalalignment="left",
+    )
+
+    ax.text(0, 0.13, r"$|\Delta E_{Hull-MP}| > $MAE", horizontalalignment="center")
+
+    ax.set(xlabel=r"$\Delta E_{Hull-MP}$ / eV per atom", ylabel="MAE / eV per atom")
+
+    ax.set(xlim=x_lim, ylim=(0.0, 0.14))
+
+    return ax
diff --git a/mb_discovery/plot_scripts/precision_recall_vs_calc_count.py b/mb_discovery/plot_scripts/precision_recall_vs_calc_count.py
@@ -16,7 +16,7 @@
 
 plt.rc("savefig", bbox="tight", dpi=200)
 plt.rcParams["figure.constrained_layout.use"] = True
-plt.rc("figure", dpi=150)
+plt.rc("figure", dpi=200)
 plt.rc("font", size=16)
 
 
diff --git a/mb_discovery/plot_scripts/rolling_mae_vs_hull_dist.py b/mb_discovery/plot_scripts/rolling_mae_vs_hull_dist.py
@@ -0,0 +1,73 @@
+# %%
+from datetime import datetime
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from mb_discovery import ROOT
+from mb_discovery.plot_scripts.plot_funcs import rolling_mae_vs_hull_dist
+
+
+__author__ = "Rhys Goodall, Janosh Riebesell"
+__date__ = "2022-06-18"
+
+today = f"{datetime.now():%Y-%m-%d}"  # date prefix for the figure file name
+
+plt.rc("savefig", bbox="tight", dpi=200)
+plt.rcParams["figure.constrained_layout.use"] = True
+plt.rc("figure", dpi=200)
+plt.rc("font", size=16)
+
+
+# %%
+markers = ["o", "v", "^", "H", "D", ""]  # NOTE(review): unused in this script
+
+df = pd.read_csv(
+    f"{ROOT}/data/2022-06-11-from-rhys/wren-mp-initial-structures.csv"
+    # f"{ROOT}/data/2022-08-16-wrenformer-ensemble-predictions.csv.bz2"
+).set_index("material_id")
+
+
+# %%
+rare = "all"  # label interpolated into img_path below
+# from pymatgen.core import Composition
+# rare = "no-lanthanides"
+# df["contains_rare_earths"] = df.composition.map(
+#     lambda x: any(el.is_rare_earth_metal for el in Composition(x))
+# )
+# df = df.query("~contains_rare_earths")
+
+
+df_hull = pd.read_csv(
+    f"{ROOT}/data/2022-06-11-from-rhys/wbm-e-above-mp-hull.csv"
+).set_index("material_id")
+
+df["e_above_mp_hull"] = df_hull.e_above_mp_hull  # aligns on material_id
+
+assert (n_nans := df.isna().sum().sum()) == 0, f"Found {n_nans} NaNs"
+
+target_col = "e_form_target"  # subtracted from the ensemble mean below
+# --- or ---
+# target_col = "e_form_per_atom_target"
+# df["e_form_per_atom_target"] = df.e_form / df.n_sites
+
+# make sure we average the expected number of ensemble member predictions
+assert df.filter(regex=r"_pred_\d").shape[1] == 10
+
+df["e_form_pres_ens"] = df.filter(regex=r"_pred_\d+").mean(axis=1)  # ensemble mean
+df["e_above_mp_hull_pred"] = df.e_form_pres_ens - df[target_col] + df.e_above_mp_hull
+
+df["residual"] = df.e_above_mp_hull_pred - df.e_above_mp_hull  # signed error
+
+
+# %%
+ax = rolling_mae_vs_hull_dist(  # rolling MAE of the ensemble mean vs hull distance
+    df,
+    e_above_hull_col="e_above_mp_hull",
+    residual_col="residual",
+)
+
+ax.figure.set_size_inches(10, 9)
+
+img_path = f"{ROOT}/figures/{today}-rolling-mae-vs-hull-dist-{rare=}.pdf"
+# plt.savefig(img_path)  # uncomment to write the figure to img_path
diff --git a/mb_discovery/plot_scripts/rolling_mae_vs_hull_dist_wbm_batches.py b/mb_discovery/plot_scripts/rolling_mae_vs_hull_dist_wbm_batches.py
@@ -0,0 +1,76 @@
+# %%
+from datetime import datetime
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from mb_discovery import ROOT
+from mb_discovery.plot_scripts.plot_funcs import rolling_mae_vs_hull_dist
+
+
+__author__ = "Rhys Goodall, Janosh Riebesell"
+__date__ = "2022-06-18"
+
+today = f"{datetime.now():%Y-%m-%d}"  # date prefix for the figure file name
+
+
+plt.rc("savefig", bbox="tight", dpi=200)
+plt.rcParams["figure.constrained_layout.use"] = True
+plt.rc("figure", dpi=200)
+plt.rc("font", size=16)
+
+
+# %%
+rare = "all"  # label interpolated into img_path below
+
+df_wbm = pd.read_csv(
+    f"{ROOT}/data/2022-06-11-from-rhys/wren-mp-initial-structures.csv"
+).set_index("material_id")
+
+df_hull = pd.read_csv(
+    f"{ROOT}/data/2022-06-11-from-rhys/wbm-e-above-mp-hull.csv"
+).set_index("material_id")
+
+df_wbm["e_above_mp_hull"] = df_hull.e_above_mp_hull  # aligns on material_id
+assert df_wbm.e_above_mp_hull.isna().sum() == 0  # every row needs a hull distance
+
+target_col = "e_form_target"  # subtracted from the ensemble mean below
+
+# make sure we average the expected number of ensemble member predictions
+assert df_wbm.filter(regex=r"_pred_\d").shape[1] == 10
+
+df_wbm["e_above_mp_hull_pred"] = (
+    df_wbm.filter(regex=r"_pred_\d").mean(axis=1)  # ensemble mean
+    - df_wbm[target_col]
+    + df_wbm.e_above_mp_hull
+)
+df_wbm["error"] = abs(df_wbm.e_above_mp_hull_pred - df_wbm.e_above_mp_hull)
+
+
+# %%
+fig, ax = plt.subplots(1, figsize=(10, 9))
+markers = ("o", "v", "^", "H", "D")
+assert len(markers) == 5  # number of WBM rounds of element substitution
+
+for idx, marker in enumerate(markers, 1):
+    title = f"Batch {idx}"
+    df = df_wbm[df_wbm.index.str.startswith(f"wbm-step-{idx}")]  # this batch only
+
+    rolling_mae_vs_hull_dist(  # adds one curve per batch
+        df,
+        residual_col="error",  # already absolute, see df_wbm["error"] above
+        e_above_hull_col="e_above_mp_hull",
+        ax=ax,  # all batches share one axes
+        label=title,
+        marker=marker,
+        markevery=20,
+        markerfacecolor="white",
+        markeredgewidth=2.5,
+    )
+
+
+ax.legend(loc="lower right", frameon=False)
+
+
+img_path = f"{ROOT}/figures/{today}-rolling-mae-vs-hull-dist-wbm-batches-{rare=}.pdf"
+# plt.savefig(img_path)  # uncomment to write the figure to img_path