ALE custom grid points update. (#731)

RobertSamoilescu · web-flow · commit 5db36d96a25e · 2022-08-05T09:44:12.000+01:00
* Loosen lower/upper bound conditions to allow custom grid points instead of clamping to min/max value. Included test.

* Updated docs for min_bin_points.

* Minor grammar correction.
diff --git a/alibi/explainers/ale.py b/alibi/explainers/ale.py
@@ -111,7 +111,8 @@ def explain(self,
             Features for which to calculate ALE.
         min_bin_points
             Minimum number of points each discretized interval should contain to ensure more precise
-            ALE estimation.
+            ALE estimation. Only relevant for adaptive grid points (i.e., features without an entry in the
+            `grid_points` dictionary).
         grid_points
             Custom grid points. Must be a `dict` where the keys are features indices and the values are
             monotonically increasing `numpy` arrays defining the grid points for each feature.
@@ -138,8 +139,8 @@ def explain(self,
 
          - Grid points outside the feature range. Consider the following example: `O O O X X O X O X O O`, \
         where 3 grid-points are smaller than the minimum value in `f`, and 2 grid-points are larger than the maximum \
-        value in `f`. Grid-points outside the feature value range are clipped between the minimum and maximum \
-        values of `f`. The grid-points considered will be: `(O|X) X O X O (X|O)`.
+        value in `f`. The empty leading and ending bins are removed. The grid-points considered \
+        will be: `O X X O X O X O`.
 
          - Grid points that do not cover the entire feature range. Consider the following example: \
         `X X O X X O X O X X X X X`. Two auxiliary grid-points are added which correspond the value of the minimum \
@@ -408,7 +409,7 @@ def ale_num(
         Custom grid points. An `numpy` array defining the grid points for the given features.
     min_bin_points
         Minimum number of points each discretized interval should contain to ensure more precise
-        ALE estimation.
+        ALE estimation. Only relevant for adaptive grid points (i.e., when ``feature_grid_points=None``).
     check_feature_resolution
         Refer to :class:`ALE` documentation.
     low_resolution_threshold
@@ -445,12 +446,24 @@ def ale_num(
         fvals = np.sort(feature_grid_points)
 
         if min_val > fvals[0]:
-            logger.warning(f'Feature {feature} grid-points contain lower values than the minimum feature value. '
-                           'Automatically lower bound clipping the grid-points values.')
+            # select the greatest grid point that is less than or equal to the minimum feature value
+            min_idx = np.where(fvals <= min_val)[0][-1]
+            min_val = fvals[min_idx]
+
+            if min_idx != 0:
+                logger.warning(f'The leading bins of feature {feature} defined by the grid-points do not contain '
+                               'any feature values. Automatically removing the empty leading bins to ensure that '
+                               'each bin contains at least one feature value.')
 
         if max_val < fvals[-1]:
-            logger.warning(f'Feature {feature} grid-points contain larger values than the maximum feature value. '
-                           'Automatically upper bound clipping the grid-points values.')
+            # select the smallest grid point that is greater than or equal to the maximum feature value
+            max_idx = np.where(fvals >= max_val)[0][0]
+            max_val = fvals[max_idx]
+
+            if max_idx != len(fvals) - 1:
+                logger.warning(f'The ending bins of feature {feature} defined by the grid-points do not contain '
+                               'any feature values. Automatically removing the empty ending bins to ensure that '
+                               'each bin contains at least one feature value.')
 
         # clip the values and remove duplicates
         fvals = np.unique(np.clip(fvals, a_min=min_val, a_max=max_val))
@@ -469,13 +482,17 @@ def ale_num(
 
         # check how many feature values are in each bin
         indices = np.searchsorted(fvals, X[:, feature], side="left")
-        interval_n = np.bincount(indices)  # number of points in each interval
+        # put the smallest data point in the first interval
+        indices[indices == 0] = 1
+        # count the number of points in each interval without considering the first bin,
+        # because the first bin will always contain 0 (see line above)
+        interval_n = np.bincount(indices)[1:]
 
         if np.any(interval_n == 0):
-            fvals = np.delete(fvals, np.where(interval_n == 0)[0])
+            fvals = np.delete(fvals, np.where(interval_n == 0)[0] + 1)  # +1 because we don't consider the first bin
             logger.warning(f'Some bins of feature {feature} defined by the grid-points do not contain '
                            'any feature values. Automatically merging consecutive bins to ensure that '
-                           'each bin contains at least one value.')
+                           'each bin contains at least one feature value.')
 
     # if the feature is constant, calculate the ALE on a small interval surrounding the feature value
     if len(fvals) == 1:
diff --git a/alibi/explainers/tests/test_ale.py b/alibi/explainers/tests/test_ale.py
@@ -137,18 +137,15 @@ def test_explain(mock_ale_explainer, features, input_dim, batch_size, custom_gri
 @pytest.mark.parametrize('extrapolate_constant_min', (0.1, 1.0))
 @pytest.mark.parametrize('constant_value', (5.,))
 @pytest.mark.parametrize('feature', (1,))
-@pytest.mark.parametrize('custom_grid', (True, False))
 def test_constant_feature(extrapolate_constant, extrapolate_constant_perc, extrapolate_constant_min,
-                          constant_value, feature, custom_grid):
+                          constant_value, feature):
     X = np.random.normal(size=(100, 2))
     X[:, feature] = constant_value
     predict = lambda x: x.sum(axis=1)  # dummy predictor # noqa
-    feature_grid_points = np.random.normal((1, )) if custom_grid else None
 
     q, ale, ale0 = ale_num(predictor=predict,
                            X=X,
                            feature=feature,
-                           feature_grid_points=feature_grid_points,
                            extrapolate_constant=extrapolate_constant,
                            extrapolate_constant_perc=extrapolate_constant_perc,
                            extrapolate_constant_min=extrapolate_constant_min)
@@ -159,3 +156,59 @@ def test_constant_feature(extrapolate_constant, extrapolate_constant_perc, extra
         assert_allclose(q, np.array([constant_value]))
         assert_allclose(ale, np.array([[0.]]))
         assert_allclose(ale0, np.array([0.]))
+
+
+@pytest.mark.parametrize('num_bins', [1, 3, 5, 7, 15])
+@pytest.mark.parametrize('perc_bins', [0.1, 0.2, 0.5, 0.7, 0.9, 1.0])
+@pytest.mark.parametrize('size_data', [1, 5, 10, 50, 100])
+@pytest.mark.parametrize('outside_grid', [False, True])
+def test_grid_points_stress(num_bins, perc_bins, size_data, outside_grid):
+    np.random.seed(0)
+    eps = 1
+
+    # define the grid between [-10, 10] having `num_bins` bins
+    grid = np.unique(np.random.uniform(-10, 10, size=num_bins + 1))
+
+    # select specific bins to sample the data from grid defined above.
+    # the number of bins is controlled by the percentage of bins given by `perc_bins`
+    nbins = int(np.ceil(num_bins * perc_bins))
+    bins = np.sort(np.random.choice(num_bins, size=nbins, replace=False))
+
+    # generate data
+    X = []
+    selected_bins = []
+
+    for i in range(size_data):
+        # select a bin at random and mark it as selected
+        bin = np.random.choice(bins, size=1)
+        selected_bins.append(bin.item())
+
+        # define offset to ensure that the value is sampled within the bin
+        # (i.e. avoid edge cases where the data might land on the grid point)
+        # the ALE implementation should work even in that case, only the process of constructing
+        # the expected values might require additional logic
+        offset = 0.1 * (grid[bin + 1] - grid[bin])
+        X.append(np.random.uniform(low=grid[bin] + offset, high=grid[bin + 1] - offset).item())
+
+    # add values outside the grid to test that the grid is extended
+    if outside_grid:
+        X = X + [grid[0] - eps, grid[-1] + eps]
+
+    # construct dataset, define dummy predictor, and get grid values used by ale
+    X = np.array(X).reshape(-1, 1)
+    predict = lambda x: x.sum(axis=1)  # noqa
+    q, _, _ = ale_num(predictor=predict, X=X, feature=0, feature_grid_points=grid)
+
+    # construct expected grid by merging selected bins
+    if outside_grid:
+        # add first and last bin corresponding to min and max value.
+        # This requires incrementing all the previous values by 1
+        selected_bins = np.array(selected_bins + [-1, num_bins]) + 1
+
+        # update grid point to include the min and max
+        grid = np.insert(grid, 0, grid[0] - eps)
+        grid = np.insert(grid, len(grid), grid[-1] + eps)
+
+    selected_bins = np.unique(selected_bins)
+    expected_q = np.array([grid[selected_bins[0]]] + [grid[b + 1] for b in selected_bins])
+    np.testing.assert_allclose(q, expected_q)