refactor cumulative_clf_metric()->cumulative_precision_recall() to plot both prec and recall for all models

janosh · janosh · commit 4972c010523f · 2023-06-19T20:29:22.000-07:00
support both plotly and matplotlib as backend
diff --git a/matbench_discovery/energy.py b/matbench_discovery/energy.py
@@ -138,7 +138,7 @@ def classify_stable(
             negative values close to 0 make sense.
 
     Returns:
-        tuple[pd.Series, pd.Series, pd.Series, pd.Series]: Indices for true positives,
+        tuple[TP, FN, FP, TN]: Indices as pd.Series for true positives,
             false negatives, false positives and true negatives (in this order).
     """
     actual_pos = e_above_hull_true <= stability_threshold
diff --git a/matbench_discovery/plots.py b/matbench_discovery/plots.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import math
 from typing import Any, Literal
 
 import matplotlib.pyplot as plt
@@ -80,11 +81,11 @@ def hist_classified_stable_vs_hull_dist(
     ax: plt.Axes = None,
     which_energy: WhichEnergy = "true",
     stability_threshold: float = 0,
-    show_threshold: bool = True,
     x_lim: tuple[float | None, float | None] = (-0.4, 0.4),
     rolling_accuracy: float | None = 0.02,
     backend: Backend = "plotly",
     ylabel: str = "Number of materials",
+    **kwargs: Any,
 ) -> tuple[plt.Axes | go.Figure, dict[str, float]]:
     """
     Histogram of the energy difference (either according to DFT ground truth [default]
@@ -108,13 +109,13 @@ def hist_classified_stable_vs_hull_dist(
             distance or the model's predicted hull distance for the histogram.
         stability_threshold (float, optional): set stability threshold as distance to
             convex hull in eV/atom, usually 0 or 0.1 eV.
-        show_threshold (bool, optional): Whether to plot stability threshold as dashed
-            vertical line.
         x_lim (tuple[float | None, float | None]): x-axis limits.
         rolling_accuracy (float): Rolling accuracy window size in eV / atom. Set to None
             or 0 to disable. Defaults to 0.02, meaning 20 meV / atom.
         backend ('matplotlib' | 'plotly'], optional): Which plotting backend to use.
             Changes the return type.
+        kwargs: Additional keyword arguments passed to the ax.hist() or px.histogram()
+            depending on backend.
 
     Returns:
         tuple[plt.Axes, dict[str, float]]: plot axes and classification metrics
@@ -159,15 +160,17 @@ def hist_classified_stable_vs_hull_dist(
             color=["tab:green", "tab:orange", "tab:red", "tab:blue"],
             label=labels,
             stacked=True,
+            **kwargs,
         )
         ax.set(xlabel=xlabel, ylabel=ylabel, xlim=x_lim)
 
-        ax.axvline(
-            stability_threshold,
-            color="black",
-            linestyle="--",
-            label="Stability Threshold",
-        )
+        if stability_threshold is not None:
+            ax.axvline(
+                stability_threshold,
+                color="black",
+                linestyle="--",
+                label="Stability Threshold",
+            )
 
         if rolling_accuracy:
             # add moving average of the accuracy computed within given window
@@ -203,28 +206,35 @@ def hist_classified_stable_vs_hull_dist(
             # )
 
     if backend == "plotly":
-        clf = (true_pos * 1 + false_neg * 2 + false_pos * 3 + true_neg * 4).map(
+        clf = (true_pos + false_neg * 2 + false_pos * 3 + true_neg * 4).map(
             dict(zip(range(1, 5), labels))
         )
         df = pd.DataFrame(dict(e_above_hull=e_above_hull, clf=clf))
 
         ax = px.histogram(
-            df, x="e_above_hull", color="clf", nbins=20000, range_x=x_lim, opacity=0.9
+            df,
+            x="e_above_hull",
+            color="clf",
+            nbins=20000,
+            range_x=x_lim,
+            opacity=0.9,
+            **kwargs,
         )
         ax.update_layout(
             dict(xaxis_title=xlabel, yaxis_title=ylabel),
             legend=dict(title=None, yanchor="top", y=1, xanchor="right", x=1),
         )
 
-        ax.add_vline(stability_threshold, line=dict(dash="dash", width=1))
-        ax.add_annotation(
-            text="Stability threshold",
-            x=stability_threshold,
-            y=1.1,
-            yref="paper",
-            font=dict(size=14, color="gray"),
-            showarrow=False,
-        )
+        if stability_threshold is not None:
+            ax.add_vline(stability_threshold, line=dict(dash="dash", width=1))
+            ax.add_annotation(
+                text="Stability threshold",
+                x=stability_threshold,
+                y=1.1,
+                yref="paper",
+                font=dict(size=14, color="gray"),
+                showarrow=False,
+            )
 
     recall = n_true_pos / n_total_pos
     return ax, dict(
@@ -341,115 +351,141 @@ def rolling_mae_vs_hull_dist(
     return ax
 
 
-def cumulative_clf_metric(
+def cumulative_precision_recall(
     e_above_hull_true: pd.Series,
-    e_above_hull_pred: pd.Series,
-    metric: Literal["precision", "recall"],
+    df_preds: pd.DataFrame,
     stability_threshold: float = 0,  # set stability threshold as distance to convex
     # hull in eV / atom, usually 0 or 0.1 eV
-    ax: plt.Axes = None,
-    label: str = None,
     project_end_point: AxLine = "xy",
     show_optimal: bool = False,
+    backend: Backend = "plotly",
     **kwargs: Any,
-) -> plt.Axes:
-    """Precision and recall as a function of the number of included materials sorted
-    by model-predicted distance to the convex hull, i.e. materials predicted most stable
-    enter the precision and recall calculation first. The curves end when all materials
-    predicted stable are included.
+) -> tuple[plt.Figure | go.Figure, pd.DataFrame]:
+    """Create 2 subplots side-by-side with cumulative precision and recall curves for
+    all models starting with materials predicted most stable, adding the next material,
+    recomputing the cumulative metrics, adding the next most stable material and so on
+    until each model no longer predicts the material to be stable. Again, materials
+    predicted more stable enter the precision and recall calculation sooner. Different
+    models predict different number of materials to be stable. Hence the curves end at
+    different points.
 
     Args:
         e_above_hull_true (pd.Series): Distance to convex hull according to DFT
             ground truth (in eV / atom).
-        e_above_hull_pred (pd.Series): Distance to convex hull predicted by model
-            (in eV / atom). Same as true energy to convex hull plus predicted minus true
-            formation energy.
-        metric ('precision' | 'recall', optional): Metric to plot.
-        stability_threshold (float, optional): Max distance from convex hull before
+        df_preds (pd.DataFrame): Distance to convex hull predicted by models, one column
+            per model (in eV / atom). Same as true energy to convex hull plus predicted
+            minus true formation energy.
+        stability_threshold (float, optional): Max distance above convex hull before
             material is considered unstable. Defaults to 0.
-        label (str, optional): Model name used to identify its liens in the legend.
-            Defaults to None.
-        project_end_point ('x' | 'y' | 'xy' | '', optional): Defaults to '', i.e. no
+        project_end_point ('x' | 'y' | 'xy' | '', optional): Whether to project end
+        points of precision and recall curves to the x/y axis. Defaults to '', i.e. no
             axis projection lines.
-        show_optimal (bool, optional): Whether to plot the optimal precision/recall
-            line. Defaults to False.
-        **kwargs: Keyword arguments passed to ax.plot().
+        show_optimal (bool, optional): Whether to plot the optimal recall line. Defaults
+            to False.
+        backend ('plotly' | 'matplotlib', optional): Defaults to 'plotly'. **kwargs:
+        Keyword arguments passed to df.plot().
 
     Returns:
-        plt.Axes: The matplotlib axes object.
+        tuple[plt.Figure | go.Figure, pd.DataFrame]: The matplotlib/plotly figure and
+            dataframe of cumulative metrics for each model.
     """
-    ax = ax or plt.gca()
+    fact = lambda: pd.DataFrame(index=range(len(e_above_hull_true)))
+    dfs = dict(Precision=fact(), Recall=fact())
 
-    e_above_hull_pred = e_above_hull_pred.sort_values()
-    e_above_hull_true = e_above_hull_true.loc[e_above_hull_pred.index]
+    for model_name in df_preds:
+        model_preds = df_preds[model_name].sort_values()
+        e_above_hull_true = e_above_hull_true.loc[model_preds.index]
 
-    true_pos, false_neg, false_pos, _true_neg = classify_stable(
-        e_above_hull_true, e_above_hull_pred, stability_threshold
-    )
+        true_pos, false_neg, false_pos, _true_neg = classify_stable(
+            e_above_hull_true, model_preds, stability_threshold
+        )
 
-    true_pos_cumsum = true_pos.cumsum()
+        true_pos_cumsum = true_pos.cumsum()
+        # precision aka positive predictive value (PPV)
+        precision = true_pos_cumsum / (true_pos_cumsum + false_pos.cumsum())
+        n_total_pos = sum(true_pos) + sum(false_neg)
+        recall = true_pos_cumsum / n_total_pos  # aka true_pos_rate aka sensitivity
 
-    # precision aka positive predictive value (PPV)
-    precision = true_pos_cumsum / (true_pos_cumsum + false_pos.cumsum()) * 100
-    n_true_pos = sum(true_pos)
-    n_false_neg = sum(false_neg)
-    n_total_pos = n_true_pos + n_false_neg
-    true_pos_rate = true_pos_cumsum / n_total_pos * 100
+        end = int(np.argmax(recall))
+        xs = np.arange(end)
 
-    end = int(np.argmax(true_pos_rate))
-    xs = np.arange(end)
+        prec_interp = scipy.interpolate.interp1d(xs, precision[:end], kind="cubic")
+        recall_interp = scipy.interpolate.interp1d(xs, recall[:end], kind="cubic")
+        dfs["Precision"][model_name] = pd.Series(prec_interp(xs))
+        dfs["Recall"][model_name] = pd.Series(recall_interp(xs))
 
-    ys_raw = dict(precision=precision, recall=true_pos_rate)[metric]
-    y_interp = scipy.interpolate.interp1d(xs, ys_raw[:end], kind="cubic")
-    ys = y_interp(xs)
+    for key, df in dfs.items():
+        # drop all-NaN rows so plotly plot x-axis only extends to largest number of
+        # predicted materials by any model
+        df.dropna(how="all", inplace=True)
+        df["metric"] = key
 
-    line_kwargs = dict(
-        linewidth=2, markevery=[-1], marker="x", markersize=14, markeredgewidth=2.5
-    )
-    ax.plot(xs, ys, **line_kwargs | kwargs)
-    ax.text(
-        xs[-1],
-        ys[-1],
-        label,
-        color=kwargs.get("color"),
-        verticalalignment="bottom",
-        rotation=30,
-        bbox=dict(facecolor="white", alpha=0.5, edgecolor="none"),
-    )
+    df_cum = pd.concat(dfs.values())
 
-    # add some visual guidelines
-    intersect_kwargs = dict(linestyle=":", alpha=0.4, color=kwargs.get("color"))
-    if "x" in project_end_point:
-        ax.plot((0, xs[-1]), (ys[-1], ys[-1]), **intersect_kwargs)
-    if "y" in project_end_point:
-        ax.plot((xs[-1], xs[-1]), (0, ys[-1]), **intersect_kwargs)
-
-    ax.set(ylim=(0, 100), ylabel=f"{metric.title()} (%)")
-
-    # optimal recall line finds all stable materials without any false positives
-    # can be included to confirm all models start out of with near optimal recall
-    # and to see how much each model overshoots total n_stable
-    n_below_hull = sum(e_above_hull_true < 0)
-    if show_optimal:
-        ax.plot(
-            [0, n_below_hull],
-            [0, 100],
-            color="green",
-            linestyle="dashed",
-            linewidth=1,
-            label=f"Optimal {metric.title()}",
+    if backend == "matplotlib":
+        fig, axs = plt.subplots(1, 2, figsize=(15, 7), sharey=True)
+        line_kwargs = dict(
+            linewidth=3, markevery=[-1], marker="x", markersize=14, markeredgewidth=2.5
         )
-        ax.text(
-            n_below_hull,
-            100,
-            label,
-            color=kwargs.get("color"),
-            verticalalignment="top",
-            rotation=-30,
-            bbox=dict(facecolor="white", alpha=0.5, edgecolor="none"),
+        for (key, df), ax in zip(dfs.items(), axs):
+            # select every n-th row of df so that 1000 rows are left for increased
+            # plotting speed and reduced file size
+            # falls back on every row if df has less than 1000 rows
+
+            df.iloc[:: len(df) // 1000 or 1].plot(
+                ax=ax, legend=False, backend=backend, **line_kwargs | kwargs, ylabel=key
+            )
+
+        # add some visual guidelines
+        intersect_kwargs = dict(linestyle=":", alpha=0.4, linewidth=2)
+        bbox = dict(facecolor="white", alpha=0.5, edgecolor="none")
+        assert len(axs) == len(dfs), f"{len(axs)} != {len(dfs)}"
+
+        for ax, df in zip(axs, dfs.values()):
+            ax.set(ylim=(0, 1), xlim=(0, None), ylabel=key)
+            for model in df_preds:
+                x_end = df[model].dropna().index[-1]
+                y_end = df[model].dropna().iloc[-1]
+                # place model name at the end of every line
+                ax.text(x_end, y_end, model, va="bottom", rotation=30, bbox=bbox)
+                if "x" in project_end_point:
+                    ax.plot((x_end, x_end), (0, y_end), **intersect_kwargs)
+                if "y" in project_end_point:
+                    ax.plot((0, x_end), (y_end, y_end), **intersect_kwargs)
+
+        # optimal recall line finds all stable materials without any false positives
+        # can be included to confirm all models start out of with near optimal recall
+        # and to see how much each model overshoots total n_stable
+        n_below_hull = sum(e_above_hull_true < 0)
+        if show_optimal:
+            opt_label = "Optimal Recall"
+            axs[1].plot([0, n_below_hull], [0, 1], color="green", linestyle="--")
+            axs[1].text(
+                *[n_below_hull, 0.81],
+                opt_label,
+                color="green",
+                va="bottom",
+                ha="right",
+                rotation=math.degrees(math.cos(math.atan(1 / n_below_hull))),
+                bbox=bbox,
+            )
+
+    elif backend == "plotly":
+        fig = df_cum.iloc[:: len(df_cum) // 1000 or 1].plot(
+            backend=backend, facet_col="metric", **kwargs
         )
+        fig.update_traces(line=dict(width=4))
+        for idx in range(1, 3):
+            fig.update_xaxes(
+                title_text="Number of materials predicted stable", row=1, col=idx
+            )
+        fig.update_yaxes(title="Precision", col=1)
+        fig.update_yaxes(title="Recall", col=2)
+        fig.for_each_annotation(lambda a: a.update(text=""))
+        fig.update_layout(legend=dict(title=""))
+        fig.update_layout(showlegend=False)
 
-    return ax
+    return fig, df_cum
 
 
 def wandb_scatter(table: wandb.Table, fields: dict[str, str], **kwargs: Any) -> None:
diff --git a/scripts/hist_classified_stable_vs_hull_dist.py b/scripts/hist_classified_stable_vs_hull_dist.py
@@ -60,9 +60,9 @@
 
 
 # %%
-fig_name = f"{ROOT}/figures/{today}-wren-wbm-hull-dist-hist-{which_energy=}.pdf"
+img_path = f"{ROOT}/figures/{today}-wren-wbm-hull-dist-hist-{which_energy=}.pdf"
 if hasattr(ax, "write_image"):
-    # fig.write_image(fig_name)
-    ax.write_html(fig_name.replace(".pdf", ".html"))
+    # fig.write_image(img_path)
+    ax.write_html(img_path.replace(".pdf", ".html"))
 else:
-    ax.figure.savefig(fig_name)
+    ax.figure.savefig(img_path)
diff --git a/scripts/hist_classified_stable_vs_hull_dist_models.py b/scripts/hist_classified_stable_vs_hull_dist_models.py
@@ -37,7 +37,7 @@
 
 backend: Backend = "matplotlib"
 if backend == "matplotlib":
-    fig, axs = plt.subplots(3, 3, figsize=(18, 12))
+    fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(18, 12))
 else:
     fig = make_subplots(rows=3, cols=3)
 
@@ -75,9 +75,13 @@
         frameon=False,
     )
 
-fig
+fig.show()
 
 
 # %%
-img_path = f"{ROOT}/figures/{today}-wbm-hull-dist-hist-models.pdf"
-ax.figure.savefig(img_path)
+img_path = f"{ROOT}/figures/{today}-wbm-hull-dist-hist-models"
+# if hasattr(fig, "write_image"):
+#     fig.write_image(f"{img_path}.pdf")
+#     fig.write_html(f"{img_path}.html", include_ploltyjs="cdn")
+# else:
+#     fig.savefig(f"{img_path}.pdf")
diff --git a/scripts/precision_recall.py b/scripts/precision_recall.py
diff --git a/tests/test_plots.py b/tests/test_plots.py