This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Dataset remix #5372

Merged
merged 39 commits on Aug 25, 2021
Changes from 34 commits

Commits (39)
10c5479
Adds a dataset that can be read and written lazily
dirkgr Aug 7, 2021
36f9b67
This approach might work better.
dirkgr Aug 7, 2021
e74540d
Make ShuffledSequence take indices
dirkgr Aug 7, 2021
0eb53bf
Formatting
dirkgr Aug 7, 2021
dcedfd5
Adds failing test
dirkgr Aug 7, 2021
36948ce
Merge remote-tracking branch 'origin/main' into TangoBigData
dirkgr Aug 11, 2021
44eccf9
Fix sparse sequence tests
dirkgr Aug 11, 2021
f305de7
Fixes the Sqlite format
dirkgr Aug 11, 2021
61f8810
Quality-of-life hack
dirkgr Aug 11, 2021
989f15c
Makes an internal string less alarming
dirkgr Aug 11, 2021
9c461b7
Save the files to the right place
dirkgr Aug 11, 2021
15e0be4
Merge remote-tracking branch 'origin/main' into TangoBigData
dirkgr Aug 18, 2021
ca26abe
Formatting
dirkgr Aug 19, 2021
f2f0a34
Merge remote-tracking branch 'origin/main' into TangoBigData
dirkgr Aug 19, 2021
bb572b3
Fix for SqliteDatasetFormat
dirkgr Aug 20, 2021
6953d7d
Performance improvement for SqliteSparseSequence
dirkgr Aug 20, 2021
3f99be7
Changelog
dirkgr Aug 20, 2021
d69ea38
Merge branch 'main' into TangoBigData
dirkgr Aug 20, 2021
d58a52f
Global imports
dirkgr Aug 20, 2021
104777d
More Sequence classes
dirkgr Aug 21, 2021
b6b5f05
Say DatasetDict when we mean DatasetDict
dirkgr Aug 21, 2021
05c4dd6
Test for the sequences
dirkgr Aug 21, 2021
4304a93
Use the step name correctly in the error message
dirkgr Aug 21, 2021
d6cb8ab
Use and consume step_name correctly in Step.from_params()
dirkgr Aug 21, 2021
fd305a6
Uncacheable steps don't get cached even if they have a name
dirkgr Aug 21, 2021
3ae61eb
Adds a step that can remix a dataset
dirkgr Aug 21, 2021
2004fd2
Improve log message
dirkgr Aug 21, 2021
b0c3626
Fix relative import
dirkgr Aug 21, 2021
fcf651f
Changelog
dirkgr Aug 21, 2021
aa82e3d
Merge branch 'main' into DatasetRemix
dirkgr Aug 23, 2021
ca5cad3
Adds documentation
dirkgr Aug 23, 2021
d5f11f4
Merge branch 'DatasetRemix' of https://github.com/allenai/allennlp in…
dirkgr Aug 23, 2021
c52b050
Give the option of changing a det_hash simply()
dirkgr Aug 23, 2021
a32c7f2
Tix fypo
dirkgr Aug 23, 2021
6cccd64
Adds ability to shuffle datasets
dirkgr Aug 24, 2021
765575d
Test for det_hash
dirkgr Aug 24, 2021
c69df7e
Merge branch 'main' into DatasetRemix
dirkgr Aug 24, 2021
451e4ee
We don't use relative imports
dirkgr Aug 25, 2021
1d71b69
Merge branch 'DatasetRemix' of https://github.com/allenai/allennlp in…
dirkgr Aug 25, 2021
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -30,6 +30,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added `ScaledDotProductMatrixAttention`, and converted the transformer toolkit to use it
- Added tests to ensure that all `Attention` and `MatrixAttention` implementations are interchangeable
- Added a way for AllenNLP Tango to read and write datasets lazily.
- Added a way to remix datasets flexibly

### Fixed

@@ -42,6 +43,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `ConfigurationError` is now pickleable.
- Multitask models now support `TextFieldTensor` in heads, not just in the backbone.
- Fixed the signature of `ScaledDotProductAttention` to match the other `Attention` classes
- Fixed the way names are applied to Tango `Step` instances.

### Changed

48 changes: 45 additions & 3 deletions allennlp/common/det_hash.py
@@ -1,6 +1,7 @@
import collections
import hashlib
import io
from typing import Any
from typing import Any, MutableMapping

import base58
import dill
@@ -13,6 +14,9 @@ def det_hash_object(self) -> Any:
representation. Sometimes you want to take control over what goes into
that hash. In that case, implement this method. `det_hash()` will pickle the
result of this method instead of the object itself.

If you return `None`, `det_hash()` falls back to the original behavior and pickles
the object.
"""
raise NotImplementedError()

@@ -38,10 +42,48 @@ def det_hash_object(self) -> Any:
return self._det_hash_object


class DetHashWithVersion(CustomDetHash):
"""
Add this class as a mixin base class to make sure your class's det_hash can be modified
by altering a static `VERSION` member of your class.
"""

VERSION = None

def det_hash_object(self) -> Any:
if self.VERSION is not None:
return self.VERSION, self
else:
return None


class _DetHashPickler(dill.Pickler):
def __init__(self, buffer: io.BytesIO):
super().__init__(buffer)

# We keep track of how deeply we are nesting the pickling of an object.
# If a class returns `self` as part of `det_hash_object()`, it causes an
# infinite recursion, because we try to pickle the `det_hash_object()`, which
# contains `self`, which returns a `det_hash_object()`, etc.
# So we keep track of how many times recursively we are trying to pickle the
# same object. We only call `det_hash_object()` the first time. We assume that
# if `det_hash_object()` returns `self` in any way, we want the second time
# to just pickle the object as normal. `DetHashWithVersion` takes advantage
# of this ability.
self.recursively_pickled_ids: MutableMapping[int, int] = collections.Counter()

def save(self, obj, save_persistent_id=True):
self.recursively_pickled_ids[id(obj)] += 1
super().save(obj, save_persistent_id)
self.recursively_pickled_ids[id(obj)] -= 1

def persistent_id(self, obj: Any) -> Any:
if isinstance(obj, CustomDetHash):
return obj.__class__.__qualname__, obj.det_hash_object()
if isinstance(obj, CustomDetHash) and self.recursively_pickled_ids[id(obj)] <= 1:
det_hash_object = obj.det_hash_object()
if det_hash_object is not None:
return obj.__class__.__module__, obj.__class__.__qualname__, det_hash_object
else:
return None
elif isinstance(obj, type):
return obj.__module__, obj.__qualname__
else:
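The recursion counter above is what makes the `DetHashWithVersion` mixin workable: its `det_hash_object()` returns `(VERSION, self)`, and when the pickler meets `self` the second time it pickles it normally instead of recursing forever. Below is a minimal sketch of the intended usage, assuming `det_hash()` is the module-level entry point the docstrings refer to; `MyTokenizer` is a made-up class for illustration.

```python
from allennlp.common.det_hash import DetHashWithVersion, det_hash


class MyTokenizer(DetHashWithVersion):
    # Hypothetical example class. Bump VERSION whenever behavior changes in a
    # way that should invalidate previously computed hashes (and cached results).
    VERSION = "002"

    def __init__(self, lowercase: bool = True):
        self.lowercase = lowercase


# The hash covers both VERSION and the object's pickled state, so changing
# either VERSION or a field like `lowercase` yields a different det_hash.
print(det_hash(MyTokenizer()))
```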
85 changes: 85 additions & 0 deletions allennlp/common/sequences.py
@@ -0,0 +1,85 @@
import bisect
import random
from collections import abc
from typing import Sequence, Optional, Union


class ShuffledSequence(abc.Sequence):
"""
Produces a shuffled view of a sequence, such as a list.

This assumes that the inner sequence never changes. If it does, the results
are undefined.
"""

def __init__(self, inner_sequence: Sequence, indices: Optional[Sequence[int]] = None):
self.inner = inner_sequence
self.indices: Sequence[int]
if indices is None:
self.indices = list(range(len(inner_sequence)))
random.shuffle(self.indices)
else:
self.indices = indices

def __len__(self) -> int:
return len(self.indices)

def __getitem__(self, i: Union[int, slice]):
if isinstance(i, int):
return self.inner[self.indices[i]]
else:
return ShuffledSequence(self.inner, self.indices[i])

def __contains__(self, item) -> bool:
for i in self.indices:
if self.inner[i] == item:
return True
return False


class SlicedSequence(ShuffledSequence):
"""
Produces a sequence that's a slice into another sequence, without copying the elements.

This assumes that the inner sequence never changes. If it does, the results
are undefined.
"""

def __init__(self, inner_sequence: Sequence, s: slice):
super().__init__(inner_sequence, range(*s.indices(len(inner_sequence))))


class ConcatenatedSequence(abc.Sequence):
"""
Produces a sequence that's the concatenation of multiple other sequences, without
copying the elements.

This assumes that the inner sequence never changes. If it does, the results
are undefined.
"""

def __init__(self, *sequences: Sequence):
self.sequences = sequences
self.cumulative_sequence_lengths = [0]
for sequence in sequences:
self.cumulative_sequence_lengths.append(
self.cumulative_sequence_lengths[-1] + len(sequence)
)

def __len__(self):
return self.cumulative_sequence_lengths[-1]

def __getitem__(self, i: Union[int, slice]):
if isinstance(i, int):
if i < 0:
i += len(self)
if i < 0 or i >= len(self):
raise IndexError("list index out of range")
sequence_index = bisect.bisect_right(self.cumulative_sequence_lengths, i) - 1
i -= self.cumulative_sequence_lengths[sequence_index]
return self.sequences[sequence_index][i]
else:
return SlicedSequence(self, i)

def __contains__(self, item) -> bool:
return any(s.__contains__(item) for s in self.sequences)
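
Taken together, these classes provide lazy, copy-free views over sequences, and they compose. A short sketch (the sample data is made up):

```python
from allennlp.common.sequences import (
    ConcatenatedSequence,
    ShuffledSequence,
    SlicedSequence,
)

numbers = list(range(10))
letters = ["a", "b", "c"]

shuffled = ShuffledSequence(numbers)           # random permutation view
head = SlicedSequence(numbers, slice(0, 5))    # view of numbers[0:5], no copy
combined = ConcatenatedSequence(numbers, letters)

assert sorted(shuffled) == numbers
assert list(head) == [0, 1, 2, 3, 4]
assert len(combined) == 13
assert combined[10] == "a"   # bisect maps index 10 into `letters`
assert "b" in combined
```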
5 changes: 2 additions & 3 deletions allennlp/common/sqlite_sparse_sequence.py
@@ -2,10 +2,9 @@
import shutil
from os import PathLike
from typing import MutableSequence, Any, Union, Iterable

from sqlitedict import SqliteDict

from allennlp.tango.dataloader import ShuffledSequence
from allennlp.common.sequences import SlicedSequence


class SqliteSparseSequence(MutableSequence[Any]):
@@ -28,7 +27,7 @@ def __getitem__(self, i: Union[int, slice]) -> Any:
else:
return None
elif isinstance(i, slice):
return ShuffledSequence(self, range(*i.indices(len(self))))
return SlicedSequence(self, i)
else:
raise TypeError(f"list indices must be integers or slices, not {i.__class__.__name__}")

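With this change, slicing a `SqliteSparseSequence` returns a genuine `SlicedSequence` view instead of a `ShuffledSequence` pressed into that role. A hypothetical usage sketch; the constructor argument is assumed from the `PathLike` import rather than shown in this diff:

```python
from allennlp.common.sqlite_sparse_sequence import SqliteSparseSequence

seq = SqliteSparseSequence("instances.sqlite")  # assumed filename argument
seq.append({"text": "hello"})                   # standard MutableSequence API
window = seq[0:100]  # a lazy SlicedSequence view; nothing is copied
```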
35 changes: 2 additions & 33 deletions allennlp/tango/dataloader.py
@@ -4,10 +4,8 @@
"""

import logging
import random
from collections import abc
from math import floor, ceil
from typing import Optional, Iterator, Sequence, Union, Dict, Any
from typing import Optional, Iterator, Sequence, Dict, Any

import more_itertools
import torch
@@ -22,6 +20,7 @@
Vocabulary,
)
from allennlp.nn.util import move_to_device
from allennlp.common.sequences import ShuffledSequence


class TangoDataLoader(Registrable):
@@ -86,36 +85,6 @@ def set_target_device(self, device: torch.device) -> None:
self.target_device = device


class ShuffledSequence(abc.Sequence):
"""
Produces a shuffled view of a sequence, such as a list.

This assumes that the inner sequence never changes. If it does, the results
are undefined.
"""

def __init__(self, inner_sequence: Sequence, indices: Optional[Sequence[int]] = None):
self.inner = inner_sequence
self.indices: Sequence[int]
if indices is None:
self.indices = list(range(len(inner_sequence)))
random.shuffle(self.indices)
else:
self.indices = indices

def __len__(self) -> int:
return len(self.inner)

def __getitem__(self, i: Union[int, slice]):
if isinstance(i, int):
return self.inner[self.indices[i]]
else:
return ShuffledSequence(self.inner, self.indices[i])

def __contains__(self, item) -> bool:
return self.inner.__contains__(item)


@TangoDataLoader.register("batch_size")
class BatchSizeDataLoader(TangoDataLoader):
"""A data loader that turns instances into batches with a constant number of instances
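Besides relocating the class, this removal fixes two bugs in the old dataloader-local `ShuffledSequence`: `__len__` returned `len(self.inner)` even when an explicit `indices` subset was given, and `__contains__` consulted the inner sequence directly. The version now in `allennlp.common.sequences` keys both off `self.indices`. A sketch of the behavioral difference:

```python
from allennlp.common.sequences import ShuffledSequence

view = ShuffledSequence(list(range(100)), indices=[3, 1, 4])

assert len(view) == 3    # the removed version would have reported 100
assert 4 in view         # 4 sits at one of the chosen indices
assert 99 not in view    # the removed version would have said True
```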
49 changes: 47 additions & 2 deletions allennlp/tango/dataset.py
@@ -4,11 +4,13 @@
"""

import itertools
import re
from dataclasses import dataclass, field
from typing import Mapping, Any, Optional, Sequence, Dict

from allennlp.data import Vocabulary, DatasetReader, Instance
from allennlp.tango.step import Step
from allennlp.common.sequences import SlicedSequence, ConcatenatedSequence
from tqdm import tqdm


@@ -39,9 +41,9 @@ def __len__(self) -> int:
@Step.register("dataset_reader_adapter")
class DatasetReaderAdapterStep(Step):
"""
This step creates an `AllenNlpDataset` from old-school dataset readers. If you're
This step creates a `DatasetDict` from old-school dataset readers. If you're
tempted to write a new `DatasetReader`, and then use this step with it, don't.
Just write a `Step` that creates the `AllenNlpDataset` you need directly.
Just write a `Step` that creates the `DatasetDict` you need directly.
"""

DETERMINISTIC = True # We're giving the dataset readers some credit here.
@@ -72,3 +74,46 @@ def run(self, reader: DatasetReader, splits: Dict[str, str]) -> DatasetDict: #
instance.index_fields(vocab)

return DatasetDict(splits=instances_map, vocab=vocab)


@Step.register("dataset_remix")
class DatasetRemixStep(Step):
"""
This step can remix splits in a dataset into new splits.
"""

DETERMINISTIC = True
CACHEABLE = False # This is so fast it's not worth caching.
VERSION = "001"

def run( # type: ignore
self, input: DatasetDict, new_splits: Dict[str, str], keep_old_splits: bool = True
) -> DatasetDict:
def get_slice(split_name: str) -> Sequence[Any]:
slice_match = re.match(r"(.*)\[([0123456789:]*)]", split_name)

Review thread on this line:

Contributor:
This won't work for something like train[:50000] + dev[:10000]. Is it supposed to?

Member (author):
Yes, it will work. This function is only called on the parts after .split("+").

Contributor:
Ok, I see. And train[:50000] in that case is interpreted as it should be?

Member (author):
Yes. It supports full Python slice syntax on line 98. In condensed form, it does slice(*match.split(":")).

if slice_match is None:
return input[split_name]
else:
split_name = slice_match[1]
slice_args = [int(a) if len(a) > 0 else None for a in slice_match[2].split(":")]
return SlicedSequence(input[split_name], slice(*slice_args))

def parse_split_spec(split_spec: str):
parts = [get_slice(name.strip()) for name in split_spec.split("+")]
if len(parts) == 1:
return parts[0]
else:
return ConcatenatedSequence(*parts)

if keep_old_splits:
result = dict(input.splits.items())
else:
result = {}
result.update(
{
new_split_name: parse_split_spec(new_split_spec)
for new_split_name, new_split_spec in new_splits.items()
}
)

return DatasetDict(vocab=input.vocab, metadata=input.metadata, splits=result)
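
To make the split-spec syntax discussed in the review thread concrete, here is a hypothetical invocation. Calling `run()` directly bypasses the step-execution machinery and is shown only for illustration; the `DatasetDict` named `dataset` is assumed to have `train` and `dev` splits.

```python
step = DatasetRemixStep()
remixed = step.run(
    input=dataset,
    new_splits={
        # Full Python slice syntax per split name; "+" concatenates lazily.
        "train": "train[:50000] + dev[:10000]",
        "tiny": "train[:128]",
    },
    keep_old_splits=False,
)
# remixed.splits now holds only "train" and "tiny", backed by
# SlicedSequence/ConcatenatedSequence views over the original data.
```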
2 changes: 1 addition & 1 deletion allennlp/tango/hf_dataset.py
@@ -10,7 +10,7 @@

@Step.register("hf_dataset")
class HuggingfaceDataset(Step):
"""This steps reads a huggingface dataset and returns it in `AllenNlpDataset` format."""
"""This steps reads a huggingface dataset and returns it in `DatasetDict` format."""

DETERMINISTIC = True
VERSION = "001"