refactor plot func hist_classified_stable_as_func_of_hull_dist()

janosh · janosh · commit 967b482a1008 · 2023-06-19T20:29:21.000-07:00
add script mb_discovery/plot_scripts/hist_classified_stable_as_func_of_hull_dist_batches.py
diff --git a/mb_discovery/plot_scripts/hist_classified_stable_as_func_of_hull_dist.py b/mb_discovery/plot_scripts/hist_classified_stable_as_func_of_hull_dist.py
@@ -0,0 +1,82 @@
+# %%
+from datetime import datetime
+from typing import Literal
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from mb_discovery import ROOT
+from mb_discovery.plot_scripts.plot_funcs import (
+    hist_classified_stable_as_func_of_hull_dist,
+)
+
+
+__author__ = "Rhys Goodall, Janosh Riebesell"
+__date__ = "2022-06-18"
+
+"""
+Histogram of the energy difference (either according to DFT ground truth [default] or
+model predicted energy) to the convex hull for materials in the WBM data set. The
+histogram is broken down into true positives, false negatives, false positives, and true
+negatives based on whether the model predicts candidates to be below the known convex
+hull. Ideally, in a discovery setting, a model should exhibit high recall, i.e. the
+majority of materials below the convex hull being correctly identified by the model.
+
+See fig. S1 in https://science.org/doi/10.1126/sciadv.abn4117.
+"""
+
+today = f"{datetime.now():%Y-%m-%d}"
+
+plt.rc("savefig", bbox="tight", dpi=200)
+plt.rcParams["figure.constrained_layout.use"] = True
+plt.rc("figure", dpi=150)
+plt.rc("font", size=16)
+
+
+# %%
+df = pd.read_csv(
+    f"{ROOT}/data/2022-06-11-from-rhys/wren-mp-initial-structures.csv"
+).set_index("material_id")
+
+df_hull = pd.read_csv(
+    f"{ROOT}/data/2022-06-11-from-rhys/wbm-e-above-mp-hull.csv"
+).set_index("material_id")
+
+df["e_above_mp_hull"] = df_hull.e_above_mp_hull
+
+# download wbm-steps-summary.csv (23.31 MB)
+df_summary = pd.read_csv(
+    "https://figshare.com/ndownloader/files/36714216?private_link=ff0ad14505f9624f0c05"
+).set_index("material_id")
+
+
+# %%
+nan_counts = df.isna().sum()
+assert all(nan_counts == 0), f"df should not have missing values: {nan_counts}"
+
+target_col = "e_form_target"
+criterion: Literal["energy", "std", "neg_std"] = "energy"
+energy_type: Literal["true", "pred"] = "true"
+
+
+# make sure we average the expected number of ensemble member predictions
+pred_cols = df.filter(regex=r"_pred_\d").columns
+assert len(pred_cols) == 10
+
+ax = hist_classified_stable_as_func_of_hull_dist(
+    df,
+    target_col,
+    pred_cols,
+    e_above_hull_col="e_above_mp_hull",
+    energy_type=energy_type,
+    criterion=criterion,
+)
+
+ax.figure.set_size_inches(10, 9)
+
+ax.legend(loc="upper left", frameon=False)
+
+img_path = (
+    f"{ROOT}/figures/{today}-wren-wbm-hull-dist-hist-{energy_type=}-{criterion=}.pdf"
+)
+# plt.savefig(img_path)
diff --git a/mb_discovery/plot_scripts/hist_classified_stable_as_func_of_hull_dist_batches.py b/mb_discovery/plot_scripts/hist_classified_stable_as_func_of_hull_dist_batches.py
@@ -0,0 +1,85 @@
+# %%
+from datetime import datetime
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from mb_discovery import ROOT
+from mb_discovery.plot_scripts.plot_funcs import (
+    hist_classified_stable_as_func_of_hull_dist,
+)
+
+
+__author__ = "Rhys Goodall, Janosh Riebesell"
+__date__ = "2022-08-25"
+
+"""
+Histogram of the energy difference (either according to DFT ground truth [default] or
+model predicted energy) to the convex hull for materials in the WBM data set. The
+histogram is broken down into true positives, false negatives, false positives, and true
+negatives based on whether the model predicts candidates to be below the known convex
+hull. Ideally, in a discovery setting, a model should exhibit high recall, i.e. the
+majority of materials below the convex hull being correctly identified by the model.
+
+See fig. S1 in https://science.org/doi/10.1126/sciadv.abn4117.
+"""
+
+today = f"{datetime.now():%Y-%m-%d}"
+
+plt.rc("savefig", bbox="tight", dpi=200)
+plt.rcParams["figure.constrained_layout.use"] = True
+plt.rc("figure", dpi=150)
+plt.rc("font", size=16)
+
+
+# %%
+df = pd.read_csv(
+    f"{ROOT}/data/2022-06-11-from-rhys/wren-mp-initial-structures.csv"
+).set_index("material_id")
+
+df_hull = pd.read_csv(
+    f"{ROOT}/data/2022-06-11-from-rhys/wbm-e-above-mp-hull.csv"
+).set_index("material_id")
+
+df["e_above_mp_hull"] = df_hull.e_above_mp_hull
+
+# download wbm-steps-summary.csv (23.31 MB)
+df_summary = pd.read_csv(
+    "https://figshare.com/ndownloader/files/36714216?private_link=ff0ad14505f9624f0c05"
+).set_index("material_id")
+
+
+# %%
+assert df.e_above_mp_hull.isna().sum() == 0
+
+energy_type = "true"
+criterion = "energy"
+df["wbm_batch"] = df.index.str.split("-").str[2]
+fig, axs = plt.subplots(2, 3, figsize=(18, 9))
+
+# make sure we average the expected number of ensemble member predictions
+pred_cols = df.filter(regex=r"_pred_\d").columns
+assert len(pred_cols) == 10
+
+common_kwargs = dict(
+    target_col="e_form_target",
+    pred_cols=pred_cols,
+    energy_type=energy_type,
+    criterion=criterion,
+    e_above_hull_col="e_above_mp_hull",
+)
+
+for (batch_idx, batch_df), ax in zip(df.groupby("wbm_batch"), axs.flat):
+    hist_classified_stable_as_func_of_hull_dist(batch_df, ax=ax, **common_kwargs)
+
+    title = f"Batch {batch_idx} ({len(batch_df):,})"  # size of this batch, not df
+    ax.set(title=title)
+
+
+hist_classified_stable_as_func_of_hull_dist(df, ax=axs.flat[-1], **common_kwargs)
+
+axs.flat[-1].set(title=f"Combined ({len(df):,})")  # last panel pools all batches
+axs.flat[0].legend(frameon=False, loc="upper left")
+
+img_name = f"{today}-wren-wbm-hull-dist-hist-{energy_type=}-{criterion=}.pdf"
+# plt.savefig(f"{ROOT}/figures/{img_name}")
diff --git a/mb_discovery/plot_scripts/plot_funcs.py b/mb_discovery/plot_scripts/plot_funcs.py
@@ -1,42 +1,33 @@
-# %%
-from typing import Literal
+from __future__ import annotations
+
+from typing import Literal, Sequence
 
 import matplotlib.pyplot as plt
 import pandas as pd
-from matplotlib.offsetbox import AnchoredText
 
 
 __author__ = "Janosh Riebesell"
 __date__ = "2022-08-05"
 
-"""
-Histogram of the energy difference (either according to DFT ground truth [default] or
-model predicted energy) to the convex hull for materials in the WBM data set. The
-histogram is broken down into true positives, false negatives, false positives, and true
-negatives based on whether the model predicts candidates to be below the known convex
-hull. Ideally, in discovery setting a model should exhibit high recall, i.e. the
-majority of materials below the convex hull being correctly identified by the model.
-
-See fig. S1 in https://science.org/doi/10.1126/sciadv.abn4117.
-"""
-
 
 plt.rc("savefig", bbox="tight", dpi=200)
 plt.rcParams["figure.constrained_layout.use"] = True
 plt.rc("figure", dpi=150)
 plt.rc("font", size=16)
 
 
-def hist_classify_stable_as_func_of_hull_dist(
-    # df: pd.DataFrame,
-    formation_energy_targets: pd.Series,
-    formation_energy_preds: pd.Series,
-    e_above_hull_vals: pd.Series,
-    rare: str = "all",
-    std_vals: pd.Series = None,
-    criterion: Literal["energy", "std", "neg"] = "energy",
+def hist_classified_stable_as_func_of_hull_dist(
+    df: pd.DataFrame,
+    target_col: str,
+    pred_cols: Sequence[str],
+    e_above_hull_col: str,
+    ax: plt.Axes = None,
     energy_type: Literal["true", "pred"] = "true",
-    annotate_all_stats: bool = False,
+    criterion: Literal["energy", "std", "neg_std"] = "energy",
+    show_mae: bool = False,
+    stability_thresh: float = 0,  # set stability threshold as distance to convex hull
+    # in eV / atom, usually 0 or 0.1 eV
+    x_lim: tuple[float, float] = (-0.4, 0.4),
 ) -> plt.Axes:
     """
     Histogram of the energy difference (either according to DFT ground truth [default]
@@ -49,40 +40,43 @@ def hist_classify_stable_as_func_of_hull_dist(
 
     See fig. S1 in https://science.org/doi/10.1126/sciadv.abn4117.
 
-
-    # NOTE this figure plots hist bars separately which causes aliasing in pdf
-    # to resolve this take into Inkscape and merge regions by color
+    NOTE: this figure plots hist bars separately, which causes aliasing in PDFs.
+    To resolve this, open the PDF in Inkscape and merge regions by color.
     """
-    assert e_above_hull_vals.isna().sum() == 0
+    if ax is None:
+        ax = plt.gca()
 
-    error = formation_energy_preds - formation_energy_targets
+    error = df[pred_cols].mean(axis=1) - df[target_col]
+    e_above_hull_vals = df[e_above_hull_col]
     mean = error + e_above_hull_vals
 
-    test = mean
+    if criterion == "energy":
+        test = mean
+    elif "std" in criterion:
+        # TODO column names to compute standard deviation from are currently hardcoded
+        # needs to be updated when adding non-aviary models with uncertainty estimation
+        var_aleatoric = (df.filter(like="_ale_") ** 2).mean(axis=1)
+        var_epistemic = df.filter(regex=r"_pred_\d").var(axis=1, ddof=0)
+        std_total = (var_epistemic + var_aleatoric) ** 0.5
 
-    if std_vals is not None:
         if criterion == "std":
-            test += std_vals
-        elif criterion == "neg":
-            test -= std_vals
+            test = mean + std_total  # out-of-place so mean stays unshifted
+        elif criterion == "neg_std":
+            test = mean - std_total
 
-    xlim = (-0.4, 0.4)
-
-    # set stability threshold at on or 0.1 eV / atom above the hull
-    stability_thresh = (0, 0.1)[0]
-
     actual_pos = e_above_hull_vals <= stability_thresh
     actual_neg = e_above_hull_vals > stability_thresh
     model_pos = test <= stability_thresh
     model_neg = test > stability_thresh
 
     n_true_pos = len(e_above_hull_vals[actual_pos & model_pos])
     n_false_neg = len(e_above_hull_vals[actual_pos & model_neg])
 
     n_total_pos = n_true_pos + n_false_neg
+    null = n_total_pos / len(e_above_hull_vals)  # prevalence: (tp + fn) / total
 
     # --- histogram by DFT-computed distance to convex hull
     if energy_type == "true":
         true_pos = e_above_hull_vals[actual_pos & model_pos]
         false_neg = e_above_hull_vals[actual_pos & model_neg]
         false_pos = e_above_hull_vals[actual_neg & model_pos]
@@ -97,12 +91,10 @@ def hist_classify_stable_as_func_of_hull_dist(
         true_neg = mean[actual_neg & model_neg]
         xlabel = r"$\Delta E_{Hull-Pred}$ / eV per atom"
 
-    fig, ax = plt.subplots(1, 1, figsize=(10, 9))
-
     ax.hist(
         [true_pos, false_neg, false_pos, true_neg],
         bins=200,
-        range=xlim,
+        range=x_lim,
         alpha=0.5,
         color=["tab:green", "tab:orange", "tab:red", "tab:blue"],
         label=[
@@ -114,49 +106,34 @@ def hist_classify_stable_as_func_of_hull_dist(
         stacked=True,
     )
 
-    ax.legend(frameon=False, loc="upper left")
-
     n_true_pos, n_false_pos, n_true_neg, n_false_neg = (
         len(true_pos),
         len(false_pos),
         len(true_neg),
         len(false_neg),
     )
     # null = (tp + fn) / (tp + tn + fp + fn)
-    Null = n_total_pos / len(e_above_hull_vals)
-    PPV = n_true_pos / (n_true_pos + n_false_pos)
-    TPR = n_true_pos / n_total_pos
-    F1 = 2 * PPV * TPR / (PPV + TPR)
-
-    assert n_true_pos + n_false_pos + n_true_neg + n_false_neg == len(
-        formation_energy_targets
-    )
-
-    RMSE = (error**2).mean() ** 0.5
-    MAE = error.abs().mean()
-
-    # anno_text = f"Prevalence = {null:.2f}\nPrecision = {ppv:.2f}\nRecall = {tpr:.2f}",
-    anno_text = f"Enrichment Factor = {PPV/Null:.3}"
-    if annotate_all_stats:
-        anno_text += f"\n{MAE = :.3}\n{RMSE = :.3}\n{Null = :.3}\n{TPR = :.3}"
-    else:
-        print(f"{PPV = :.3}")
-        print(f"{TPR = :.3}")
-        print(f"{F1 = :.3}")
-        print(f"Enrich: {PPV/Null:.2f}")
-        print(f"{Null = :.3}")
-        print(f"{MAE = :.3}")
-        print(f"{RMSE = :.3}")
-
-    text_box = AnchoredText(
-        anno_text, loc="upper right", frameon=False, prop=dict(fontsize=16)
+    precision = n_true_pos / (n_true_pos + n_false_pos)
+
+    assert n_true_pos + n_false_pos + n_true_neg + n_false_neg == len(df)
+
+    # recall = n_true_pos / n_total_pos
+    # f"Prevalence = {null:.2f}\n{precision = :.2f}\n{recall = :.2f}",
+    text = f"Enrichment\nFactor = {precision/null:.3}"
+    if show_mae:
+        MAE = error.abs().mean()
+        text += f"\n{MAE = :.3}"
+
+    ax.text(
+        0.98,
+        0.98,
+        text,
+        fontsize=18,
+        verticalalignment="top",
+        horizontalalignment="right",
+        transform=ax.transAxes,
     )
-    ax.add_artist(text_box)
 
-    ax.set(
-        xlabel=xlabel,
-        ylabel="Number of Compounds",
-        title=f"data size = {len(e_above_hull_vals):,}",
-    )
+    ax.set(xlabel=xlabel, ylabel="Number of compounds")
 
     return ax