add true_pred_hist to histograms.py

janosh · janosh · commit 0ea0ef65a171 · 2021-03-24T14:43:20.000+01:00
diff --git a/assets/true_pred_hist.svg b/assets/true_pred_hist.svg
diff --git a/mlmatrics/__init__.py b/mlmatrics/__init__.py
@@ -6,7 +6,7 @@
     ptable_elemental_prevalence,
     ptable_elemental_ratio,
 )
-from .histograms import residual_hist
+from .histograms import residual_hist, true_pred_hist
 from .metrics import regression_metrics
 from .parity import (
     density_hexbin,
diff --git a/mlmatrics/elements.py b/mlmatrics/elements.py
@@ -45,7 +45,7 @@ def ptable_elemental_prevalence(
     """Display the prevalence of each element in a materials dataset plotted as a
     heatmap over the periodic table. `formulas` xor `elem_counts` must be passed.
 
-    Adapted from https://github.com/kaaiian/ML_figures.
+    Adapted from https://github.com/kaaiian/ML_figures (https://git.io/JmbaI).
 
     Args:
         formulas (list[str]): compositional strings, e.g. ["Fe2O3", "Bi2Te3"]
@@ -240,7 +240,7 @@ def hist_elemental_prevalence(
 ) -> None:
     """Plots a histogram of the prevalence of each element in a materials dataset.
 
-    Adapted from https://github.com/kaaiian/ML_figures.
+    Adapted from https://github.com/kaaiian/ML_figures (https://git.io/JmbaI).
 
     Args:
         formulas (list): compositional strings, e.g. ["Fe2O3", "Bi2Te3"]
diff --git a/mlmatrics/histograms.py b/mlmatrics/histograms.py
@@ -1,19 +1,34 @@
 import matplotlib.pyplot as plt
 import numpy as np
 from matplotlib.axes import Axes
+from mpl_toolkits.axes_grid1.inset_locator import inset_axes
 from numpy import ndarray as Array
 from scipy.stats import gaussian_kde
 
 
 def residual_hist(
-    y_true: Array, y_pred: Array, ax: Axes = None, xlabel: str = None
+    y_true: Array, y_pred: Array, ax: Axes = None, xlabel: str = None, **kwargs
 ) -> Axes:
+    """Plot the residual distribution overlaid with a Gaussian kernel
+    density estimate.
+
+    Adapted from https://github.com/kaaiian/ML_figures (https://git.io/Jmb2O).
+
+    Args:
+        y_true (Array): ground truth targets
+        y_pred (Array): model predictions
+        ax (Axes, optional): plt axes. Defaults to None.
+        xlabel (str, optional): x-axis label. Defaults to None.
+
+    Returns:
+        Axes: plt axes with plotted data.
+    """
 
     if ax is None:
         ax = plt.gca()
 
     y_res = y_pred - y_true
-    plt.hist(y_res, bins=35, density=True, edgecolor="black")
+    plt.hist(y_res, bins=35, density=True, edgecolor="black", **kwargs)
 
     # Gaussian kernel density estimation: evaluates the Gaussian
     # probability density estimated based on the points in y_res
@@ -27,3 +42,69 @@ def residual_hist(
     plt.legend(loc=2, framealpha=0.5, handlelength=1)
 
     return ax
+
+
+def true_pred_hist(
+    y_true: Array,
+    y_pred: Array,
+    y_std: Array,
+    ax: Axes = None,
+    cmap: str = "hot",
+    bins: int = 50,
+    log: bool = True,
+    truth_color: str = "blue",
+    **kwargs,
+) -> Axes:
+    """Plot a histogram of model predictions with bars colored by the average uncertainty of
+    predictions in that bin. Overlaid with a more transparent histogram of ground truth values.
+
+    Args:
+        y_true (Array): ground truth targets
+        y_pred (Array): model predictions
+        y_std (Array): model uncertainty
+        ax (Axes, optional): plt axes. Defaults to None.
+        cmap (str, optional): string identifier of a plt colormap. Defaults to "hot".
+        bins (int, optional): Histogram resolution. Defaults to 50.
+        log (bool, optional): Whether to log-scale the y-axis. Defaults to True.
+        truth_color (str, optional): Face color to use for y_true bars. Defaults to "blue".
+        **kwargs: forwarded to both ax.hist() calls.
+
+    Returns:
+        Axes: plt axes with plotted data.
+    """
+    if ax is None:
+        ax = plt.gca()
+
+    cmap = plt.get_cmap(cmap)
+    y_true, y_pred, y_std = np.array([y_true, y_pred, y_std])
+    # one norm shared by the bar colors and the colorbar so that both agree
+    norm = plt.Normalize(vmin=y_std.min(), vmax=y_std.max())
+
+    _, bins, bars = ax.hist(
+        y_pred, bins=bins, alpha=0.8, label=r"$y_\mathrm{pred}$", **kwargs
+    )
+    ax.hist(
+        y_true,
+        bins=bins,
+        alpha=0.2,
+        color=truth_color,
+        label=r"$y_\mathrm{true}$",
+        **kwargs,
+    )
+
+    last = len(bars.patches) - 1
+    for idx, (xmin, xmax, rect) in enumerate(zip(bins, bins[1:], bars.patches)):
+        # like np.histogram: bins are half-open [xmin, xmax), only the last is closed
+        below_max = y_pred <= xmax if idx == last else y_pred < xmax
+        in_bin = (y_pred >= xmin) & below_max
+        if in_bin.any():  # empty bins have zero-height bars, nothing to color
+            rect.set_color(cmap(norm(y_std[in_bin].mean())))
+
+    if log:
+        ax.set_yscale("log")
+    ax.legend(frameon=False)
+
+    cb_ax = inset_axes(ax, width="3%", height="50%", loc="center right")
+    ax.figure.colorbar(plt.cm.ScalarMappable(norm=norm, cmap=cmap), cax=cb_ax)
+    cb_ax.yaxis.set_ticks_position("left")
+    return ax
diff --git a/mlmatrics/parity.py b/mlmatrics/parity.py
@@ -90,7 +90,7 @@ def density_scatter(
             Defaults to True.
 
     Returns:
-        Axes: plt axes containing the plot.
+        Axes: plt axes with plotted data.
     """
     if ax is None:
         ax = plt.gca()
@@ -137,7 +137,7 @@ def scatter_with_err_bar(
         title (str, optional): Plot tile. Defaults to None.
 
     Returns:
-        Axes: plt axes on which the data was plotted.
+        Axes: plt axes with plotted data.
     """
     if ax is None:
         ax = plt.gca()
@@ -169,7 +169,7 @@ def density_hexbin(
 
     # the scatter plot
     hexbin = ax.hexbin(targets, preds, gridsize=75, mincnt=1, bins="log", C=color_map)
-    cb_ax = inset_axes(ax, width="3%", height="70%", loc=4)
+    cb_ax = inset_axes(ax, width="3%", height="70%", loc="lower right")
     plt.colorbar(hexbin, cax=cb_ax)
     cb_ax.yaxis.set_ticks_position("left")
 
diff --git a/readme.md b/readme.md
@@ -32,7 +32,7 @@ numpy==1.20.1
 git+git://github.com/janosh/mlmatrics
 ```
 
-To specify a certain branch or commit, append it's name or hash, e.g.
+To specify a specific branch or commit, append its name or hash, e.g.
 
 ```txt
 git+git://github.com/janosh/mlmatrics@master # default
@@ -107,9 +107,9 @@ See [`mlmatrics/correlation.py`](mlmatrics/correlation.py).
 
 See [`mlmatrics/histograms.py`](mlmatrics/histograms.py).
 
-| [`residual_hist(y_true, y_pred)`](mlmatrics/histograms.py) |       |
-| :--------------------------------------------------------: | :---: |
-|         ![residual_hist](assets/residual_hist.svg)         |       |
+| [`residual_hist(y_true, y_pred)`](mlmatrics/histograms.py) | [`true_pred_hist(y_true, y_pred, y_std)`](mlmatrics/histograms.py) |
+| :--------------------------------------------------------: | :----------------------------------------------------------------: |
+|         ![residual_hist](assets/residual_hist.svg)         |            ![true_pred_hist](assets/true_pred_hist.svg)            |
 
 ## Adding Assets
 
@@ -130,7 +130,7 @@ python -m pytest tests/test_cumulative.py
 python -m pytest **/test_*_metrics.py
 ```
 
-You can also run single tests by passing its name to the `-k` flag:
+To run a single test, pass its name to the `-k` flag:
 
 ```sh
 python -m pytest -k test_precision_recall_curve
@@ -140,6 +140,6 @@ Consult the [`pytest`](https://docs.pytest.org/en/stable/usage.html) docs for mo
 
 ## Glossary
 
-1. **Residual** `y - y_hat`: The difference between ground truth target and model prediction.
-2. **Error** `abs(y - y_hat)`: Absolute error between target and model prediction.
-3. **Uncertainty** `y_std`: The model's estimate for its own error, i.e. how much the model thinks its prediction can be trusted. (`std` for standard deviation.)
+1. **Residual** `y_res = y_true - y_pred`: The difference between ground truth target and model prediction.
+2. **Error** `y_err = abs(y_true - y_pred)`: Absolute error between target and model prediction.
+3. **Uncertainty** `y_std`: The model's estimate for its error, i.e. how much the model thinks its prediction can be trusted. (`std` for standard deviation.)
diff --git a/scripts/plot_all.py b/scripts/plot_all.py
@@ -22,6 +22,7 @@
     residual_vs_actual,
     roc_curve,
     scatter_with_err_bar,
+    true_pred_hist,
 )
 
 plt.rcParams.update({"font.size": 20})
@@ -158,6 +159,9 @@ def savefig(filename: str) -> None:
 residual_hist(y_true, y_pred)
 savefig("residual_hist")
 
+true_pred_hist(y_true, y_pred, y_std)
+savefig("true_pred_hist")
+
 
 # %% Correlation Plots
 rand_wide_mat = pd.read_csv(f"{ROOT}/data/rand_wide_matrix.csv", header=None).to_numpy()
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
     version="0.0.1",
     author="Janosh Riebesell",
     author_email="janosh.riebesell@gmail.com",
-    description="A collection of plots useful in data-driven research of materials",
+    description="A collection of plots useful in data-driven materials science",
     long_description=open("readme.md").read(),
     long_description_content_type="text/markdown",
     url="https://github.com/janosh/mlmatrics",
diff --git a/tests/test_histograms.py b/tests/test_histograms.py
@@ -1,7 +1,11 @@
-from mlmatrics import residual_hist
+from mlmatrics import residual_hist, true_pred_hist
 
 from . import y_pred, y_true
 
 
 def test_residual_hist():
     residual_hist(y_true, y_pred)
+
+
+def test_true_pred_hist():
+    true_pred_hist(y_true, y_pred, abs(y_true - y_pred))  # y_std is an uncertainty, so >= 0

Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,7 @@`
`6`	`6`	`ptable_elemental_prevalence,`
`7`	`7`	`ptable_elemental_ratio,`
`8`	`8`	`)`
`9`		`-from .histograms import residual_hist`
	`9`	`+from .histograms import residual_hist, true_pred_hist`
`10`	`10`	`from .metrics import regression_metrics`
`11`	`11`	`from .parity import (`
`12`	`12`	`density_hexbin,`