|
1 | 1 | from typing import List, Callable, Tuple, Dict
|
| 2 | +import warnings |
2 | 3 |
|
3 | 4 | import torch
|
4 | 5 |
|
@@ -48,6 +49,16 @@ def search(self,
|
48 | 49 | Given a starting state and a step function, apply beam search to find the
|
49 | 50 | most likely target sequences.
|
50 | 51 |
|
| 52 | + Notes |
| 53 | + ----- |
| 54 | + If your step function returns ``-inf`` for some log probabilities |
| 55 | + (like if you're using a masked log-softmax) then some of the "best" |
| 56 | + sequences returned may also have ``-inf`` log probability. Specifically |
| 57 | + this happens when the beam size is larger than the number of actions |
| 58 | + with finite log probability (non-zero probability) returned by the step function. |
| 59 | + Therefore if you're using a mask you may want to check the results from ``search`` |
| 60 | + and potentially discard sequences with non-finite log probability. |
| 61 | +
|
51 | 62 | Parameters
|
52 | 63 | ----------
|
53 | 64 | start_predictions : ``torch.Tensor``
|
@@ -110,6 +121,11 @@ def search(self,
|
110 | 121 | # shape: (batch_size, beam_size), (batch_size, beam_size)
|
111 | 122 | start_top_log_probabilities, start_predicted_classes = \
|
112 | 123 | start_class_log_probabilities.topk(self.beam_size)
|
| 124 | + if self.beam_size == 1 and (start_predicted_classes == self._end_index).all(): |
| 125 | + warnings.warn("Empty sequences predicted. You may want to increase the beam size or ensure " |
| 126 | + "your step function is working properly.", |
| 127 | + RuntimeWarning) |
| 128 | + return start_predicted_classes.unsqueeze(-1), start_top_log_probabilities |
113 | 129 |
|
114 | 130 | # The log probabilities for the last time step.
|
115 | 131 | # shape: (batch_size, beam_size)
|
@@ -166,9 +182,9 @@ def search(self,
|
166 | 182 | class_log_probabilities
|
167 | 183 | )
|
168 | 184 |
|
| 185 | + # shape (both): (batch_size * beam_size, per_node_beam_size) |
169 | 186 | top_log_probabilities, predicted_classes = \
|
170 | 187 | cleaned_log_probabilities.topk(self.per_node_beam_size)
|
171 |
| - # shape (both): (batch_size * beam_size, per_node_beam_size) |
172 | 188 |
|
173 | 189 | # Here we expand the last log probabilities to (batch_size * beam_size, per_node_beam_size)
|
174 | 190 | # so that we can add them to the current log probs for this timestep.
|
@@ -227,6 +243,12 @@ def search(self,
|
227 | 243 | gather(1, expanded_backpointer).\
|
228 | 244 | reshape(batch_size * self.beam_size, *last_dims)
|
229 | 245 |
|
| 246 | + if not torch.isfinite(last_log_probabilities).all(): |
| 247 | + warnings.warn("Infinite log probabilities encountered. Some final sequences may not make sense. " |
| 248 | + "This can happen when the beam size is larger than the number of valid (non-zero " |
| 249 | + "probability) transitions that the step function produces.", |
| 250 | + RuntimeWarning) |
| 251 | + |
230 | 252 | # Reconstruct the sequences.
|
231 | 253 | # shape: [(batch_size, beam_size, 1)]
|
232 | 254 | reconstructed_predictions = [predictions[-1].unsqueeze(2)]
|
|
0 commit comments