remove ml_matrics.utils.add_identity, use plt.axline instead https://git.io/JERaj

janosh · janosh · commit d30a29fce537 · 2021-08-26T11:56:39.000+01:00
diff --git a/assets/residual_vs_actual.svg b/assets/residual_vs_actual.svg
diff --git a/ml_matrics/__init__.py b/ml_matrics/__init__.py
@@ -19,4 +19,4 @@
 from .quantile import qq_gaussian
 from .ranking import err_decay
 from .relevance import precision_recall_curve, roc_curve
-from .utils import ROOT, add_identity, annotate_bar_heights
+from .utils import ROOT, annotate_bar_heights
diff --git a/ml_matrics/cumulative.py b/ml_matrics/cumulative.py
@@ -24,8 +24,8 @@ def cum_res(preds: NumArray, targets: NumArray, ax: Axes = None) -> None:
     """Plot the empirical cumulative distribution for the residuals (y - mu).
 
     Args:
-        preds (NumArray): Numpy array of predictions.
-        targets (NumArray): Numpy array of targets.
+        preds (array): Numpy array of predictions.
+        targets (array): Numpy array of targets.
         ax (Axes, optional): plt.Axes object. Defaults to None.
     """
     if ax is None:
@@ -65,8 +65,8 @@ def cum_err(preds: NumArray, targets: NumArray, ax: Axes = None) -> None:
     """Plot the empirical cumulative distribution for the absolute errors abs(y - y_hat).
 
     Args:
-        preds (NumArray): Numpy array of predictions.
-        targets (NumArray): Numpy array of targets.
+        preds (array): Numpy array of predictions.
+        targets (array): Numpy array of targets.
         ax (Axes, optional): plt.Axes object. Defaults to None.
     """
     if ax is None:
diff --git a/ml_matrics/histograms.py b/ml_matrics/histograms.py
@@ -24,8 +24,8 @@ def residual_hist(
     Adapted from https://github.com/kaaiian/ML_figures (https://git.io/Jmb2O).
 
     Args:
-        y_true (NumArray): ground truth targets
-        y_pred (NumArray): model predictions
+        y_true (array): ground truth targets
+        y_pred (array): model predictions
         ax (Axes, optional): plt.Axes object. Defaults to None.
         xlabel (str, optional): x-axis label. Defaults to None.
 
@@ -68,9 +68,9 @@ def true_pred_hist(
     predictions in that bin. Overlayed by a more transparent histogram of ground truth values.
 
     Args:
-        y_true (NumArray): ground truth targets
-        y_pred (NumArray): model predictions
-        y_std (NumArray): model uncertainty
+        y_true (array): ground truth targets
+        y_pred (array): model predictions
+        y_std (array): model uncertainty
         ax (Axes, optional): plt.Axes object. Defaults to None.
         cmap (str, optional): string identifier of a plt colormap. Defaults to "hot".
         bins (int, optional): Histogram resolution. Defaults to 50.
@@ -129,7 +129,7 @@ def spacegroup_hist(spacegroups: NumArray, ax: Axes = None, **kwargs: Any) -> Ax
     (triclinic, monoclinic, orthorhombic, tetragonal, trigonal, hexagonal, cubic)
 
     Args:
-        spacegroups (NumArray): A list of spacegroup numbers.
+        spacegroups (array): A list of spacegroup numbers.
         ax (Axes, optional): plt.Axes object. Defaults to None.
         kwargs: Keywords passed to pd.Series.plot.bar().
 
diff --git a/ml_matrics/metrics.py b/ml_matrics/metrics.py
@@ -19,8 +19,8 @@ def regression_metrics(
     TODO make robust by finding the common axis
 
     Args:
-        y_true (NumArray): Regression targets.
-        y_preds (NumArray): Model predictions.
+        y_true (array): Regression targets.
+        y_preds (array): Model predictions.
         verbose (bool, optional): Whether to print metrics. Defaults to False.
 
     Returns:
@@ -107,8 +107,8 @@ def classification_metrics(
     to multi-task automatically?
 
     Args:
-        target (NumArray): categorical encoding of the tasks
-        logits (NumArray): logits predicted by the model
+        target (array): categorical encoding of the tasks
+        logits (array): logits predicted by the model
         verbose (bool, optional): Whether to print metrics. Defaults to False.
     """
 
diff --git a/ml_matrics/parity.py b/ml_matrics/parity.py
@@ -10,7 +10,7 @@
 from scipy.interpolate import interpn
 from sklearn.metrics import r2_score
 
-from ml_matrics.utils import NumArray, add_identity, with_hist
+from ml_matrics.utils import NumArray, with_hist
 
 
 def hist_density(
@@ -19,8 +19,8 @@ def hist_density(
     """Return an approximate density of 2d points.
 
     Args:
-        xs (NumArray): x-coordinates of points
-        ys (NumArray): y-coordinates of points
+        xs (array): x-coordinates of points
+        ys (array): y-coordinates of points
         sort (bool, optional): Whether to sort points by density so that densest points
             are plotted last. Defaults to True.
         bins (int, optional): Number of bins (histogram resolution). Defaults to 100.
@@ -76,8 +76,8 @@ def density_scatter(
     """Scatter plot colored (and optionally sorted) by density.
 
     Args:
-        xs (NumArray): x values.
-        ys (NumArray): y values.
+        xs (array): x values.
+        ys (array): y values.
         ax (Axes, optional): plt.Axes object. Defaults to None.
         color_map (str, optional): plt color map or valid string name. Defaults to "Blues".
         sort (bool, optional): Whether to sort the data. Defaults to True.
@@ -102,8 +102,12 @@ def density_scatter(
     norm = mpl.colors.LogNorm() if log else None
 
     ax.scatter(xs, ys, c=cs, cmap=color_map, norm=norm, **kwargs)
+
     if identity:
-        add_identity(ax, label="ideal")
+        ax.axline(
+            (0, 0), (1, 1), alpha=0.5, zorder=0, linestyle="dashed", color="black"
+        )
+
     if stats:
         add_mae_r2_box(xs, ys, ax)
 
@@ -128,10 +132,10 @@ def scatter_with_err_bar(
     i.e. if points farther from the parity line have larger uncertainty.
 
     Args:
-        xs (NumArray): x-values
-        ys (NumArray): y-values
-        xerr (NumArray, optional): Horizontal error bars. Defaults to None.
-        yerr (NumArray, optional): Vertical error bars. Defaults to None.
+        xs (array): x-values
+        ys (array): y-values
+        xerr (array, optional): Horizontal error bars. Defaults to None.
+        yerr (array, optional): Vertical error bars. Defaults to None.
         ax (Axes, optional): plt.Axes object. Defaults to None.
         xlabel (str, optional): x-axis label. Defaults to "Actual".
         ylabel (str, optional): y-axis label. Defaults to "Predicted".
@@ -145,7 +149,10 @@ def scatter_with_err_bar(
 
     styles = dict(markersize=6, fmt="o", ecolor="g", capthick=2, elinewidth=2)
     ax.errorbar(xs, ys, yerr=yerr, xerr=xerr, **kwargs, **styles)
-    add_identity(ax)
+
+    # identity line
+    ax.axline((0, 0), (1, 1), alpha=0.5, zorder=0, linestyle="dashed", color="black")
+
     add_mae_r2_box(xs, ys, ax)
 
     ax.set(xlabel=xlabel, ylabel=ylabel, title=title)
@@ -166,10 +173,10 @@ def density_hexbin(
     dimension passed as weights.
 
     Args:
-        xs (NumArray): x values
-        yx (NumArray): y values
+        xs (array): x values
+        yx (array): y values
         ax (Axes, optional): plt.Axes object. Defaults to None.
-        weights (NumArray, optional): If given, these values are accumulated in the bins.
+        weights (array, optional): If given, these values are accumulated in the bins.
             Otherwise, every point has value 1. Must be of the same length as x and y.
             Defaults to None.
         xlabel (str, optional): x-axis label. Defaults to "Actual".
@@ -188,7 +195,9 @@ def density_hexbin(
     plt.colorbar(hexbin, cax=cb_ax)
     cb_ax.yaxis.set_ticks_position("left")
 
-    add_identity(ax, label="ideal")
+    # identity line
+    ax.axline((0, 0), (1, 1), alpha=0.5, zorder=0, linestyle="dashed", color="black")
+
     add_mae_r2_box(xs, yx, ax, loc="upper left")
 
     ax.set(xlabel=xlabel, ylabel=ylabel)
@@ -235,8 +244,8 @@ def residual_vs_actual(y_true: NumArray, y_pred: NumArray, ax: Axes = None) -> A
     (y_err = y_true - y_pred) on the y-axis.
 
     Args:
-        y_true (NumArray): Ground truth values
-        y_pred (NumArray): Model predictions
+        y_true (array): Ground truth values
+        y_pred (array): Model predictions
         ax (Axes, optional): plt.Axes object. Defaults to None.
 
     Returns:
@@ -248,11 +257,10 @@ def residual_vs_actual(y_true: NumArray, y_pred: NumArray, ax: Axes = None) -> A
 
     y_err = y_true - y_pred
 
-    xmin = np.min(y_true) * 0.9
-    xmax = np.max(y_true) / 0.9
-
     plt.plot(y_true, y_err, "o", alpha=0.5, label=None, mew=1.2, ms=5.2)
-    plt.plot([xmin, xmax], [0, 0], "k--", alpha=0.5, label="ideal")
+    plt.axline(
+        [1, 0], [2, 0], linestyle="dashed", color="black", alpha=0.5, label="ideal"
+    )
 
     plt.ylabel(r"Residual ($y_\mathrm{test} - y_\mathrm{pred}$)")
     plt.xlabel("Actual value")
diff --git a/ml_matrics/quantile.py b/ml_matrics/quantile.py
@@ -2,15 +2,19 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
+from matplotlib.axes import Axes
 from scipy.stats import norm
 
-from ml_matrics.utils import NumArray, add_identity
+from ml_matrics.utils import NumArray
 
 
 def qq_gaussian(
-    y_true: NumArray, y_pred: NumArray, y_std: Union[NumArray, Dict[str, NumArray]]
+    y_true: NumArray,
+    y_pred: NumArray,
+    y_std: Union[NumArray, Dict[str, NumArray]],
+    ax: Axes = None,
 ) -> None:
-    """Plot the Gaussian quantile-quantile (Q-Q) plot of one (passed as NumArray)
+    """Plot the Gaussian quantile-quantile (Q-Q) plot of one (passed as array)
     or multiple (passed as dict) sets of uncertainty estimates for a single
     pair of ground truth targets `y_true` and model predictions `y_pred`.
 
@@ -25,10 +29,13 @@ def qq_gaussian(
     Info on Q-Q plots: https://wikipedia.org/wiki/Q-Q_plot
 
     Args:
-        y_true (NumArray): ground truth targets
-        y_pred (NumArray): model predictions
-        y_std (NumArray | dict): model uncertainties
+        y_true (array): ground truth targets
+        y_pred (array): model predictions
+        y_std (array | dict[str, array]): model uncertainties
     """
+    if ax is None:
+        ax = plt.gca()
+
     if isinstance(y_std, np.ndarray):
         y_std = {"std": y_std}
 
@@ -38,35 +45,33 @@ def qq_gaussian(
     lines = []  # collect plotted lines to show second legend with miscalibration areas
     for key, std in y_std.items():
 
-        z_scored = (res / std).reshape(-1, 1)
+        z_scored = (np.array(res) / std).reshape(-1, 1)
 
         exp_proportions = np.linspace(0, 1, resolution)
         gaussian_upper_bound = norm.ppf(0.5 + exp_proportions / 2)
         obs_proportions = np.mean(z_scored <= gaussian_upper_bound, axis=0)
 
-        [line] = plt.plot(
+        [line] = ax.plot(
             exp_proportions, obs_proportions, linewidth=2, alpha=0.8, label=key
         )
-        plt.fill_between(
+        ax.fill_between(
             exp_proportions, y1=obs_proportions, y2=exp_proportions, alpha=0.2
         )
         miscal_area = np.trapz(
             np.abs(obs_proportions - exp_proportions), dx=1 / resolution
         )
         lines.append([line, miscal_area])
 
-    add_identity(label="ideal")
-
-    plt.xlim(0, 1)
-    plt.ylim(0, 1)
+    # identity line
+    ax.axline((0, 0), (1, 1), alpha=0.5, zorder=0, linestyle="dashed", color="black")
 
-    plt.xlabel("Theoretical Quantile")
-    plt.ylabel("Observed Quantile")
+    ax.set(xlim=(0, 1), ylim=(0, 1))
+    ax.set(xlabel="Theoretical Quantile", ylabel="Observed Quantile")
 
     legend1 = plt.legend(loc="upper left", frameon=False)
     # Multiple legends on the same axes:
     # https://matplotlib.org/3.3.3/tutorials/intermediate/legend_guide.html#multiple-legends-on-the-same-axes
-    plt.gca().add_artist(legend1)
+    ax.add_artist(legend1)
 
     lines, areas = zip(*lines)
 
diff --git a/ml_matrics/ranking.py b/ml_matrics/ranking.py
@@ -54,9 +54,9 @@ def err_decay(
     similarly to how it decays when removing the predictions of largest error.
 
     Args:
-        y_true (NumArray): Ground truth regression targets.
-        y_pred (NumArray): Model predictions.
-        y_stds (NumArray | dict[str, NumArray]): Model uncertainties. Can be a single or
+        y_true (array): Ground truth regression targets.
+        y_pred (array): Model predictions.
+        y_stds (array | dict[str, array]): Model uncertainties. Can be a single or
             multiple types (e.g. aleatoric/epistemic/total uncertainty) in dict form.
         title (str, optional): Plot title. Defaults to None.
         n_rand (int, optional): Number of shuffles from which to compute std.dev.
diff --git a/ml_matrics/relevance.py b/ml_matrics/relevance.py
@@ -15,8 +15,8 @@ def roc_curve(
     the positive class.
 
     Args:
-        targets (NumArray): Ground truth targets.
-        proba_pos (NumArray): predicted probabilities for the positive class.
+        targets (array): Ground truth targets.
+        proba_pos (array): predicted probabilities for the positive class.
 
     Returns:
         float: The classifier's ROC area under the curve.
@@ -44,8 +44,8 @@ def precision_recall_curve(
     """Plot the precision recall curve of a binary classifier.
 
     Args:
-        targets (NumArray): Ground truth targets.
-        proba_pos (NumArray): predicted probabilities for the positive class.
+        targets (array): Ground truth targets.
+        proba_pos (array): predicted probabilities for the positive class.
 
     Returns:
         float: The classifier's precision score.
diff --git a/ml_matrics/utils.py b/ml_matrics/utils.py
@@ -1,5 +1,5 @@
 from os.path import abspath, dirname
-from typing import Any, Sequence, Union
+from typing import Sequence, Union
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -14,29 +14,6 @@
 NumArray = NDArray[Union[np.float64, np.int_]]
 
 
-def add_identity(ax: Axes = None, **line_kwargs: Any) -> None:
-    """Add a parity line (y = x) to the provided axis."""
-    if ax is None:
-        ax = plt.gca()
-
-    # zorder=0 ensures other plotted data displays on top of line
-    default_kwargs = dict(alpha=0.5, zorder=0, linestyle="dashed", color="black")
-    (identity,) = ax.plot([], [], **default_kwargs, **line_kwargs)
-
-    def callback(axes: Axes) -> None:
-        x_min, x_max = axes.get_xlim()
-        y_min, y_max = axes.get_ylim()
-        low = max(x_min, y_min)
-        high = min(x_max, y_max)
-        identity.set_data([low, high], [low, high])
-
-    callback(ax)
-    # Register callbacks to update identity line when moving plots in interactive
-    # mode to ensure line always extend to plot edges.
-    ax.callbacks.connect("xlim_changed", callback)
-    ax.callbacks.connect("ylim_changed", callback)
-
-
 def with_hist(
     xs: NumArray, ys: NumArray, cell: GridSpec = None, bins: int = 100  # type: ignore
 ) -> Axes:
@@ -46,8 +23,8 @@ def with_hist(
     above and near the right edge.
 
     Args:
-        xs (NumArray): x values.
-        ys (NumArray): y values.
+        xs (array): x values.
+        ys (array): y values.
         cell (GridSpec, optional): Cell of a plt GridSpec at which to add the
             grid of plots. Defaults to None.
         bins (int, optional): Resolution/bin count of the histograms. Defaults to 100.
diff --git a/scripts/plot_all.py b/scripts/plot_all.py