Rewrite History to not use any recursion. (skorch-dev#312)

benjamin-work · ottonemo · commit c50f3bb29571 · 2018-09-26T13:33:48.000+02:00
* Rewrite History to not use any recursion.

Instead unroll the successive indexing steps and perform them
backwards, i.e. starting with batches and following with epochs.

* Raise a KeyError when history indexing deeper than 4.

* Add a benchmark script to test History.

* Address comments by ottonemo.

* clarifying comments
* deprecation warning
diff --git a/examples/benchmarks/history.py b/examples/benchmarks/history.py
@@ -0,0 +1,145 @@
+"""Benchmark to test time and memory performance of History.
+
+Before #312, the timing would be roughly 5 sec and memory usage would
+triple. After #312, the timing would be roughly 2 sec and memory usage
+roughly constant.
+
+For the reasons, see #306.
+
+"""
+
+from pprint import pprint
+import time
+
+import numpy as np
+from sklearn.datasets import make_classification
+import torch
+
+from skorch import NeuralNetClassifier
+from skorch.callbacks import Callback
+from skorch.toy import make_classifier
+
+
+side_effects = []
+
+
+class TriggerKeyError(Callback):
+    def on_batch_end(self, net, **kwargs):
+        try:
+            net.history[-1, 'batches', -1, 'foobar']
+        except Exception as e:
+            pass
+
+
+class PrintMemory(Callback):
+    def on_batch_end(self, net, **kwargs):
+        side_effects.append((
+            torch.cuda.memory_allocated() / 1e6,
+            torch.cuda.memory_cached() / 1e6
+        ))
+
+
+def train():
+    X, y = make_classification(1000, 20, n_informative=10, random_state=0)
+    X = X.astype(np.float32)
+    y = y.astype(np.int64)
+
+    module = make_classifier(input_units=20)
+
+    net = NeuralNetClassifier(
+        module,
+        max_epochs=10,
+        lr=0.1,
+        callbacks=[TriggerKeyError(), PrintMemory()],
+        device='cuda',
+    )
+
+    return net.fit(X, y)
+
+
+def safe_slice(history, keys):
+    # catch errors
+    for key in keys:
+        try:
+            history[key]
+        except (KeyError, IndexError):
+            pass
+
+
+def performance_history(history):
+    # SUCCESSFUL
+    # level 0
+    for i in range(len(history)):
+        history[i]
+
+    # level 1
+    keys = tuple(history[0].keys())
+    history[0, keys]
+    history[:, keys]
+    for key in keys:
+        history[0, key]
+        history[:, key]
+
+    # level 2
+    for i in range(len(history[0, 'batches'])):
+        history[0, 'batches', i]
+        history[:, 'batches', i]
+    history[:, 'batches', :]
+
+    # level 3
+    keys = tuple(history[0, 'batches', 0].keys())
+    history[0, 'batches', 0, keys]
+    history[:, 'batches', 0, keys]
+    history[0, 'batches', :, keys]
+    history[:, 'batches', :, keys]
+    for key in history[0, 'batches', 0]:
+        history[0, 'batches', 0, key]
+        history[:, 'batches', 0, key]
+        history[0, 'batches', :, key]
+        history[:, 'batches', :, key]
+
+    # KEY ERRORS
+    # level 0
+    safe_slice(history, [100000])
+
+    # level 1
+    safe_slice(history, [np.s_[0, 'foo'], np.s_[:, 'foo']])
+
+    # level 2
+    safe_slice(history, [
+        np.s_[0, 'batches', 0],
+        np.s_[:, 'batches', 0],
+        np.s_[0, 'batches', :],
+        np.s_[:, 'batches', :],
+    ])
+
+    # level 3
+    safe_slice(history, [
+        np.s_[0, 'batches', 0, 'foo'],
+        np.s_[:, 'batches', 0, 'foo'],
+        np.s_[0, 'batches', :, 'foo'],
+        np.s_[:, 'batches', :, 'foo'],
+        np.s_[0, 'batches', 0, ('foo', 'bar')],
+        np.s_[:, 'batches', 0, ('foo', 'bar')],
+        np.s_[0, 'batches', :, ('foo', 'bar')],
+        np.s_[:, 'batches', :, ('foo', 'bar')],
+    ])
+
+if __name__ == '__main__':
+    net = train()
+    tic = time.time()
+    for _ in range(1000):
+        performance_history(net.history)
+    toc = time.time()
+    print("Time for performing 1000 runs: {:.5f} sec.".format(toc - tic))
+    assert toc - tic < 10, "accessing history is too slow"
+
+    print("Allocated / cached memory")
+    pprint(side_effects)
+
+    mem_start = side_effects[0][0]
+    mem_end = side_effects[-1][0]
+
+    print("Memory epoch 1: {:.4f}, last epoch: {:.4f}".format(
+        mem_start, mem_end))
+    assert np.isclose(mem_start, mem_end, rtol=1/3), "memory use should be similar"
diff --git a/skorch/history.py b/skorch/history.py
@@ -1,62 +1,71 @@
 """Contains history class and helper functions."""
 
+import warnings
+
 
 # pylint: disable=invalid-name
-class _missingno:
-    def __init__(self, e):
-        self.e = e
-
-    def __repr__(self):
-        return 'missingno'
-
-
-def _incomplete_mapper(x):
-    for xs in x:
-        # pylint: disable=unidiomatic-typecheck
-        if type(xs) is _missingno:
-            return xs
-    return x
-
-
-# pylint: disable=missing-docstring
-def partial_index(l, idx):
-    needs_unrolling = (
-        isinstance(l, list) and len(l) > 0 and isinstance(l[0], list))
-    types = int, tuple, list, slice
-    needs_indirection = isinstance(l, list) and not isinstance(idx, types)
-
-    if needs_unrolling or needs_indirection:
-        return [partial_index(n, idx) for n in l]
-
-    # join results of multiple indices
-    if isinstance(idx, (tuple, list)):
-        zz = [partial_index(l, n) for n in idx]
-        if isinstance(l, list):
-            total_join = zip(*zz)
-            inner_join = list(map(_incomplete_mapper, total_join))
-        else:
-            total_join = tuple(zz)
-            inner_join = _incomplete_mapper(total_join)
-        return inner_join
-
-    try:
-        return l[idx]
-    except KeyError as e:
-        return _missingno(e)
-
-
-# pylint: disable=missing-docstring
-def filter_missing(x):
-    if isinstance(x, list):
-        children = [filter_missing(n) for n in x]
-        # pylint: disable=unidiomatic-typecheck
-        filtered = list(filter(lambda x: type(x) != _missingno, children))
-
-        if children and not filtered:
-            # pylint: disable=unidiomatic-typecheck
-            return next(filter(lambda x: type(x) == _missingno, children))
-        return filtered
-    return x
+class _none:
+    """Special placeholder since ``None`` is a valid value."""
+
+
+def _not_none(items):
+    """Whether the item is a placeholder or contains a placeholder."""
+    if not isinstance(items, (tuple, list)):
+        items = (items,)
+    return all(item is not _none for item in items)
+
+
+def _filter_none(items):
+    """Filter special placeholder value, preserves sequence type."""
+    type_ = list if isinstance(items, list) else tuple
+    return type_(filter(_not_none, items))
+
+
+def _getitem(item, i):
+    """Extract value or values from dicts.
+
+    Covers the case of a single key or multiple keys. If not found,
+    return placeholders instead.
+
+    """
+    if not isinstance(i, (tuple, list)):
+        return item.get(i, _none)
+    type_ = list if isinstance(item, list) else tuple
+    return type_(item.get(j, _none) for j in i)
+
+
+def _unpack_index(i):
+    """Unpack index and return exactly four elements.
+
+    If index is more shallow than 4, return None for trailing
+    dimensions. If index is deeper than 4, raise a KeyError.
+
+    """
+    if len(i) > 4:
+        raise KeyError(
+            "Tried to index history with {} indices but only "
+            "4 indices are possible.".format(len(i)))
+
+    # fill trailing indices with None
+    i_e, k_e, i_b, k_b = i + tuple([None] * (4 - len(i)))
+
+    # handle special case of
+    # history[j, 'batches', somekey]
+    # which should really be
+    # history[j, 'batches', :, somekey]
+    if i_b is not None and not isinstance(i_b, (int, slice)):
+        if k_b is not None:
+            raise KeyError("The last argument '{}' is invalid; it must be a "
+                           "string or tuple of strings.".format(k_b))
+        warnings.warn(
+            "Argument 3 to history slicing must be of type int or slice, e.g. "
+            "history[:, 'batches', 'train_loss'] should be "
+            "history[:, 'batches', :, 'train_loss'].",
+            DeprecationWarning,
+        )
+        i_b, k_b = slice(None), i_b
+
+    return i_e, k_e, i_b, k_b
 
 
 class History(list):
@@ -128,6 +137,7 @@ def new_epoch(self):
 
     def new_batch(self):
         """Register a new batch row for the current epoch."""
+        # pylint: disable=invalid-sequence-index
         self[-1]['batches'].append({})
 
     def record(self, attr, value):
@@ -145,24 +155,67 @@ def record_batch(self, attr, value):
         batch.
 
         """
+        # pylint: disable=invalid-sequence-index
         self[-1]['batches'][-1][attr] = value
 
     def to_list(self):
         """Return history object as a list."""
         return list(self)
 
     def __getitem__(self, i):
+        # This implementation resolves indexing backwards,
+        # i.e. starting from the batches, then progressing to the
+        # epochs.
         if isinstance(i, (int, slice)):
-            return super().__getitem__(i)
-
-        x = self
-        if isinstance(i, tuple):
-            for part in i:
-                x_dirty = partial_index(x, part)
-                x = filter_missing(x_dirty)
-                # pylint: disable=unidiomatic-typecheck
-                if type(x) is _missingno:
-                    raise x.e
-            return x
-        raise ValueError("Invalid parameter type passed to index. "
-                         "Pass string, int or tuple.")
+            i = (i,)
+
+        # i_e: index epoch, k_e: key epoch
+        # i_b: index batch, k_b: key batch
+        i_e, k_e, i_b, k_b = _unpack_index(i)
+        keyerror_msg = "Key '{}' was not found in history."
+
+        if i_b is not None and k_e != 'batches':
+            raise KeyError("History indexing beyond the 2nd level is "
+                           "only possible if key 'batches' is used, "
+                           "found key '{}'.".format(k_e))
+
+        items = self.to_list()
+
+        # extract indices of batches
+        # handles: history[..., k_e, i_b]
+        if i_b is not None:
+            items = [row[k_e][i_b] for row in items]
+
+        # extract keys of batches
+        # handles: history[..., k_e, i_b][k_b]
+        if k_b is not None:
+            items = [
+                _filter_none([_getitem(b, k_b) for b in batches])
+                if isinstance(batches, (list, tuple))
+                else _getitem(batches, k_b)
+                for batches in items
+            ]
+            # get rid of empty batches
+            items = [b for b in items if b not in (_none, [], ())]
+            if not _filter_none(items):
+                # all rows contained _none or were empty
+                raise KeyError(keyerror_msg.format(k_b))
+
+        # extract epoch-level values, but only if not already done
+        # handles: history[..., k_e]
+        if (k_e is not None) and (i_b is None):
+            items = [_getitem(batches, k_e)
+                     for batches in items]
+            if not _filter_none(items):
+                raise KeyError(keyerror_msg.format(k_e))
+
+        # extract the epochs
+        # handles: history[i_b, ..., ..., ...]
+        if i_e is not None:
+            items = items[i_e]
+            if isinstance(i_e, slice):
+                items = _filter_none(items)
+            if items is _none:
+                raise KeyError(keyerror_msg.format(k_e))
+
+        return items
diff --git a/skorch/tests/test_history.py b/skorch/tests/test_history.py