
Commit f19c0ee

David Wadden authored and DeNeutoy committed
Enable Pruner class to keep different number of items for different entries in minibatch. (#2511)
This is a somewhat special case. There are occasionally situations where the entries in a minibatch need to be, e.g., ordered sentences from the same document; coreference resolution is an example. In AllenNLP, an entire document is treated as a single entry, but for very long documents (or on low-memory machines) it may be necessary to split a document into minibatches during training. In this situation smart batching can't be used, and entries in the same minibatch may have widely varying lengths, so keeping the same number of span candidates for each entry may not be desirable. This PR behaves exactly as before if an integer is passed to `num_items_to_keep`; if a tensor is passed instead, it keeps the desired number of items for each minibatch entry.
1 parent 3cdb7e2 commit f19c0ee
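
To illustrate the new calling convention, here is a minimal usage sketch (the Linear scorer, shapes, and values are hypothetical stand-ins, not part of this commit):

import torch
from allennlp.modules.pruner import Pruner

# Any module mapping (batch_size, num_items, embedding_size) to
# (batch_size, num_items, 1) can serve as the scorer.
scorer = torch.nn.Linear(5, 1)
pruner = Pruner(scorer=scorer)

embeddings = torch.randn(2, 4, 5)  # (batch_size, num_items, embedding_size)
mask = torch.ones(2, 4)            # all items are unpadded

# As before: keep the same number of items for every entry.
top_embeddings, top_mask, top_indices, top_scores = pruner(embeddings, mask, 2)

# New: keep a different number of items for each entry in the minibatch.
num_items_to_keep = torch.tensor([3, 1], dtype=torch.long)
top_embeddings, top_mask, top_indices, top_scores = pruner(embeddings, mask, num_items_to_keep)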

File tree

allennlp/modules/pruner.py
allennlp/tests/modules/pruner_test.py

2 files changed (+130 −22 lines)


allennlp/modules/pruner.py (+52 −21)
@@ -1,4 +1,4 @@
-from typing import Tuple
+from typing import Tuple, Union
 
 from overrides import overrides
 import torch
@@ -25,13 +25,14 @@ def __init__(self, scorer: torch.nn.Module) -> None:
     def forward(self, # pylint: disable=arguments-differ
                 embeddings: torch.FloatTensor,
                 mask: torch.LongTensor,
-                num_items_to_keep: int) -> Tuple[torch.FloatTensor, torch.LongTensor,
-                                                 torch.LongTensor, torch.FloatTensor]:
+                num_items_to_keep: Union[int, torch.LongTensor]) -> Tuple[torch.FloatTensor, torch.LongTensor,
+                                                                          torch.LongTensor, torch.FloatTensor]:
         """
         Extracts the top-k scoring items with respect to the scorer. We additionally return
         the indices of the top-k in their original order, not ordered by score, so that downstream
         components can rely on the original ordering (e.g., for knowing what spans are valid
-        antecedents in a coreference resolution model).
+        antecedents in a coreference resolution model). May use the same k for all sentences in
+        minibatch, or different k for each.
 
         Parameters
         ----------
@@ -41,26 +42,37 @@ def forward(self, # pylint: disable=arguments-differ
         mask : ``torch.LongTensor``, required.
             A tensor of shape (batch_size, num_items), denoting unpadded elements of
             ``embeddings``.
-        num_items_to_keep : ``int``, required.
-            The number of items to keep when pruning.
+        num_items_to_keep : ``Union[int, torch.LongTensor]``, required.
+            If a tensor of shape (batch_size), specifies the number of items to keep for each
+            individual sentence in minibatch.
+            If an int, keep the same number of items for all sentences.
 
         Returns
         -------
         top_embeddings : ``torch.FloatTensor``
             The representations of the top-k scoring items.
-            Has shape (batch_size, num_items_to_keep, embedding_size).
+            Has shape (batch_size, max_num_items_to_keep, embedding_size).
         top_mask : ``torch.LongTensor``
             The corresponding mask for ``top_embeddings``.
-            Has shape (batch_size, num_items_to_keep).
+            Has shape (batch_size, max_num_items_to_keep).
         top_indices : ``torch.IntTensor``
             The indices of the top-k scoring items into the original ``embeddings``
             tensor. This is returned because it can be useful to retain pointers to
            the original items, if each item is being scored by multiple distinct
-            scorers, for instance. Has shape (batch_size, num_items_to_keep).
+            scorers, for instance. Has shape (batch_size, max_num_items_to_keep).
         top_item_scores : ``torch.FloatTensor``
             The values of the top-k scoring items.
-            Has shape (batch_size, num_items_to_keep, 1).
+            Has shape (batch_size, max_num_items_to_keep, 1).
         """
+        # If an int was given for number of items to keep, construct tensor by repeating the value.
+        if isinstance(num_items_to_keep, int):
+            batch_size = mask.size(0)
+            # Put the tensor on same device as the mask.
+            num_items_to_keep = num_items_to_keep * torch.ones([batch_size], dtype=torch.long,
+                                                               device=mask.device)
+
+        max_items_to_keep = num_items_to_keep.max()
+
         mask = mask.unsqueeze(-1)
         num_items = embeddings.size(1)
         # Shape: (batch_size, num_items, 1)
@@ -73,28 +85,47 @@ def forward(self, # pylint: disable=arguments-differ
         # negative. These are logits, typically, so -1e20 should be plenty negative.
         scores = util.replace_masked_values(scores, mask, -1e20)
 
-        # Shape: (batch_size, num_items_to_keep, 1)
-        _, top_indices = scores.topk(num_items_to_keep, 1)
+        # Shape: (batch_size, max_num_items_to_keep, 1)
+        _, top_indices = scores.topk(max_items_to_keep, 1)
+
+        # Mask based on number of items to keep for each sentence.
+        # Shape: (batch_size, max_num_items_to_keep)
+        top_indices_mask = util.get_mask_from_sequence_lengths(num_items_to_keep, max_items_to_keep)
+        top_indices_mask = top_indices_mask.byte()
+
+        # Shape: (batch_size, max_num_items_to_keep)
+        top_indices = top_indices.squeeze(-1)
+
+        # Fill all masked indices with largest "top" index for that sentence, so that all masked
+        # indices will be sorted to the end.
+        # Shape: (batch_size, 1)
+        fill_value, _ = top_indices.max(dim=1)
+        fill_value = fill_value.unsqueeze(-1)
+        # Shape: (batch_size, max_num_items_to_keep)
+        top_indices = torch.where(top_indices_mask, top_indices, fill_value)
 
         # Now we order the selected indices in increasing order with
         # respect to their indices (and hence, with respect to the
         # order they originally appeared in the ``embeddings`` tensor).
         top_indices, _ = torch.sort(top_indices, 1)
 
-        # Shape: (batch_size, num_items_to_keep)
-        top_indices = top_indices.squeeze(-1)
-
-        # Shape: (batch_size * num_items_to_keep)
+        # Shape: (batch_size * max_num_items_to_keep)
         # torch.index_select only accepts 1D indices, but here
         # we need to select items for each element in the batch.
         flat_top_indices = util.flatten_and_batch_shift_indices(top_indices, num_items)
 
-        # Shape: (batch_size, num_items_to_keep, embedding_size)
+        # Shape: (batch_size, max_num_items_to_keep, embedding_size)
         top_embeddings = util.batched_index_select(embeddings, top_indices, flat_top_indices)
-        # Shape: (batch_size, num_items_to_keep)
-        top_mask = util.batched_index_select(mask, top_indices, flat_top_indices)
 
-        # Shape: (batch_size, num_items_to_keep, 1)
+        # Combine the masks on spans that are out-of-bounds, and the mask on spans that are outside
+        # the top k for each sentence.
+        # Shape: (batch_size, max_num_items_to_keep)
+        sequence_mask = util.batched_index_select(mask, top_indices, flat_top_indices)
+        sequence_mask = sequence_mask.squeeze(-1).byte()
+        top_mask = top_indices_mask & sequence_mask
+        top_mask = top_mask.long()
+
+        # Shape: (batch_size, max_num_items_to_keep, 1)
         top_scores = util.batched_index_select(scores, top_indices, flat_top_indices)
 
-        return top_embeddings, top_mask.squeeze(-1), top_indices, top_scores
+        return top_embeddings, top_mask, top_indices, top_scores
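
To make the per-entry masking above concrete, here is a small standalone sketch of the fill-and-sort trick (plain PyTorch on hypothetical toy tensors; the arange comparison stands in for util.get_mask_from_sequence_lengths):

import torch

# Suppose topk(max_k) ran over the whole batch with max_k = 2, but entry 0
# requested 2 items while entry 1 requested only 1.
top_indices = torch.tensor([[3, 0],
                            [2, 1]])
num_items_to_keep = torch.tensor([2, 1])
max_k = 2

# Lengths -> mask: entry 1's second slot exceeds its requested k.
positions = torch.arange(max_k).unsqueeze(0)                   # (1, max_k)
top_indices_mask = positions < num_items_to_keep.unsqueeze(1)  # (batch_size, max_k)

# Replace invalid slots with the row's largest kept index, so that after
# sorting they collect at the end instead of scattering among valid indices.
fill_value, _ = top_indices.max(dim=1, keepdim=True)           # (batch_size, 1)
top_indices = torch.where(top_indices_mask, top_indices, fill_value)
top_indices, _ = torch.sort(top_indices, dim=1)
print(top_indices)  # tensor([[0, 3], [2, 2]]) -- row 1 repeats its kept index

The repeated positions are then zeroed out in top_mask, which is why the tests below expect duplicated indices alongside zeros in the mask.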

allennlp/tests/modules/pruner_test.py (+78 −1)
@@ -1,4 +1,4 @@
-# pylint: disable=no-self-use,invalid-name
+# pylint: disable=no-self-use,invalid-name,not-callable
 import numpy
 import pytest
 import torch
@@ -83,3 +83,80 @@ def test_scorer_works_for_completely_masked_rows(self):
         numpy.testing.assert_array_equal(correct_scores[:2], pruned_scores[:2].data.numpy())
         numpy.testing.assert_array_equal(pruned_scores[2] < -1e15, [[1], [1]])
         numpy.testing.assert_array_equal(pruned_scores[2] == float('-inf'), [[0], [0]])
+
+    def test_pruner_selects_top_scored_items_and_respects_masking_different_num_items(self):
+        # Really simple scorer - sum up the embedding_dim.
+        scorer = lambda tensor: tensor.sum(-1).unsqueeze(-1)
+        pruner = Pruner(scorer=scorer)
+
+        items = torch.randn([3, 4, 5]).clamp(min=0.0, max=1.0)
+        items[0, 0, :] = 1.5
+        items[0, 1, :] = 2
+        items[0, 3, :] = 1
+        items[1, 1:3, :] = 1
+        items[2, 0, :] = 1
+        items[2, 1, :] = 2
+        items[2, 2, :] = 1.5
+
+        mask = torch.ones([3, 4])
+        mask[1, 3] = 0
+
+        num_items_to_keep = torch.tensor([3, 2, 1], dtype=torch.long)
+
+        pruned_embeddings, pruned_mask, pruned_indices, pruned_scores = pruner(
+                items, mask, num_items_to_keep)
+
+        # Second element in the batch would have indices 2, 3, but
+        # 3 and 0 are masked, so instead it has 1, 2.
+        numpy.testing.assert_array_equal(pruned_indices.data.numpy(), numpy.array([[0, 1, 3],
+                                                                                   [1, 2, 2],
+                                                                                   [1, 2, 2]]))
+        numpy.testing.assert_array_equal(pruned_mask.data.numpy(), numpy.array([[1, 1, 1],
+                                                                                [1, 1, 0],
+                                                                                [1, 0, 0]]))
+
+        # embeddings should be the result of index_selecting the pruned_indices.
+        correct_embeddings = batched_index_select(items, pruned_indices)
+        numpy.testing.assert_array_equal(correct_embeddings.data.numpy(),
+                                         pruned_embeddings.data.numpy())
+        # scores should be the sum of the correct embedding elements.
+        numpy.testing.assert_array_equal(correct_embeddings.sum(-1).unsqueeze(-1).data.numpy(),
+                                         pruned_scores.data.numpy())
+
+    def test_pruner_works_for_row_with_no_items_requested(self):
+        # Case where `num_items_to_keep` is a tensor rather than an int. Make sure it does the right
+        # thing when no items are requested for one of the rows.
+        scorer = lambda tensor: tensor.sum(-1).unsqueeze(-1)
+        pruner = Pruner(scorer=scorer)
+
+        items = torch.randn([3, 4, 5]).clamp(min=0.0, max=1.0)
+        items[0, :3, :] = 1
+        items[1, 2:, :] = 1
+        items[2, 2:, :] = 1
+
+        mask = torch.ones([3, 4])
+        mask[1, 0] = 0
+        mask[1, 3] = 0
+
+        num_items_to_keep = torch.tensor([3, 2, 0], dtype=torch.long)
+
+        pruned_embeddings, pruned_mask, pruned_indices, pruned_scores = pruner(
+                items, mask, num_items_to_keep)
+
+        # First element just picks top three entries. Second would pick entries 2 and 3, but 0 and 3
+        # are masked, so it takes 1 and 2 (repeating the second index). The third element is
+        # entirely masked and just repeats the largest index with a top-3 score.
+        numpy.testing.assert_array_equal(pruned_indices.data.numpy(), numpy.array([[0, 1, 2],
+                                                                                   [1, 2, 2],
+                                                                                   [3, 3, 3]]))
+        numpy.testing.assert_array_equal(pruned_mask.data.numpy(), numpy.array([[1, 1, 1],
+                                                                                [1, 1, 0],
+                                                                                [0, 0, 0]]))
+
+        # embeddings should be the result of index_selecting the pruned_indices.
+        correct_embeddings = batched_index_select(items, pruned_indices)
+        numpy.testing.assert_array_equal(correct_embeddings.data.numpy(),
+                                         pruned_embeddings.data.numpy())
+        # scores should be the sum of the correct embedding elements.
+        numpy.testing.assert_array_equal(correct_embeddings.sum(-1).unsqueeze(-1).data.numpy(),
+                                         pruned_scores.data.numpy())
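
As a usage note, the per-entry k will typically be derived from each entry's unpadded length, which is the varying-length situation that motivated this change. A minimal sketch, assuming a hypothetical spans_per_word pruning ratio:

import torch

mask = torch.tensor([[1, 1, 1, 1],
                     [1, 1, 0, 0]])  # (batch_size, num_items)
spans_per_word = 0.4                 # hypothetical ratio of candidates to keep

num_items = mask.sum(dim=1).float()  # unpadded items per entry: [4., 2.]
num_items_to_keep = torch.clamp((spans_per_word * num_items).long(), min=1)
# num_items_to_keep is tensor([1, 1]); pass it as pruner(items, mask, num_items_to_keep)

Scaling k with each entry's true length avoids keeping (and padding) a uniform number of candidates when lengths within a minibatch vary widely.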
