Predictors for demo LMs, update for coref predictor (#3202)

matt-gardner · web-flow · commit f2824fdcf1de · 2019-08-28T14:08:41.000-07:00
* predictors for demo LMs, other fixes

* added test

* More tests

* Add missing method

* Add docstring

* Fix decode methods

* pylint, mypy

* more pylint...

* docs
diff --git a/allennlp/models/masked_language_model.py b/allennlp/models/masked_language_model.py
@@ -145,11 +145,10 @@ def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor
                               for mask_positions in instance_indices])
         output_dict["words"] = top_words
         tokens = []
-        for instance_indices in output_dict['token_ids']:
-            tokens.append([[self.vocab.get_token_from_index(token_id.item(),
-                                                            namespace=self._target_namespace)
-                            for token_id in token_ids]
-                           for token_ids in instance_indices])
+        for instance_tokens in output_dict['token_ids']:
+            tokens.append([self.vocab.get_token_from_index(token_id.item(),
+                                                           namespace=self._target_namespace)
+                           for token_id in instance_tokens])
         output_dict["tokens"] = tokens
 
         return output_dict
diff --git a/allennlp/models/next_token_lm.py b/allennlp/models/next_token_lm.py
@@ -116,11 +116,10 @@ def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor
                                for index in instance_indices]])
             output_dict["words"] = top_words
         tokens = []
-        for instance_indices in output_dict['token_ids']:
-            tokens.append([[self.vocab.get_token_from_index(token_id.item(),
-                                                            namespace=self._target_namespace)
-                            for token_id in token_ids]
-                           for token_ids in instance_indices])
+        for instance_tokens in output_dict['token_ids']:
+            tokens.append([self.vocab.get_token_from_index(token_id.item(),
+                                                           namespace=self._target_namespace)
+                           for token_id in instance_tokens])
         output_dict["tokens"] = tokens
 
         return output_dict
diff --git a/allennlp/nn/util.py b/allennlp/nn/util.py
@@ -1472,3 +1472,27 @@ def inspect_parameters(module: torch.nn.Module, quiet: bool = False) -> Dict[str
     if not quiet:
         print(json.dumps(results, indent=4))
     return results
+
+
+def find_embedding_layer(model: torch.nn.Module) -> torch.nn.Module:
+    """
+    Takes a model (typically an AllenNLP ``Model``, but this works for any ``torch.nn.Module``) and
+    makes a best guess about which module is the embedding layer.  For typical AllenNLP models,
+    this often is the ``TextFieldEmbedder``, but if you're using a pre-trained contextualizer, we
+    really want layer 0 of that contextualizer, not the output.  So there are a bunch of hacks in
+    here for specific pre-trained contextualizers.
+    """
+    # We'll look for a few special cases in a first pass, then fall back to just finding a
+    # TextFieldEmbedder in a second pass if we didn't find a special case.
+    from pytorch_pretrained_bert.modeling import BertEmbeddings
+    from pytorch_transformers.modeling_gpt2 import GPT2Model
+    from allennlp.modules.text_field_embedders.text_field_embedder import TextFieldEmbedder
+    for module in model.modules():
+        if isinstance(module, BertEmbeddings):
+            return module.word_embeddings
+        if isinstance(module, GPT2Model):
+            return module.wte
+    for module in model.modules():
+        if isinstance(module, TextFieldEmbedder):
+            return module
+    raise RuntimeError("No embedding module found!")
diff --git a/allennlp/predictors/__init__.py b/allennlp/predictors/__init__.py
@@ -15,6 +15,8 @@
 from allennlp.predictors.decomposable_attention import DecomposableAttentionPredictor
 from allennlp.predictors.dialog_qa import DialogQAPredictor
 from allennlp.predictors.event2mind import Event2MindPredictor
+from allennlp.predictors.masked_language_model import MaskedLanguageModelPredictor
+from allennlp.predictors.next_token_lm import NextTokenLMPredictor
 from allennlp.predictors.nlvr_parser import NlvrParserPredictor
 from allennlp.predictors.open_information_extraction import OpenIePredictor
 from allennlp.predictors.quarel_parser import QuarelParserPredictor
diff --git a/allennlp/predictors/coref.py b/allennlp/predictors/coref.py
@@ -1,11 +1,14 @@
-from typing import List
+from copy import deepcopy
+from typing import List, Dict
 
 from overrides import overrides
 from spacy.tokens import Doc
+import numpy
 
 from allennlp.common.util import JsonDict
 from allennlp.common.util import get_spacy_model
 from allennlp.data import DatasetReader, Instance
+from allennlp.data.fields import ListField, SequenceLabelField
 from allennlp.models import Model
 from allennlp.predictors.predictor import Predictor
 
@@ -72,6 +75,41 @@ def predict_tokenized(self, tokenized_document: List[str]) -> JsonDict:
         instance = self._words_list_to_instance(tokenized_document)
         return self.predict_instance(instance)
 
+    @overrides
+    def predictions_to_labeled_instances(self,
+                                         instance: Instance,
+                                         outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
+        """
+        Takes each predicted cluster and makes it into a labeled ``Instance`` with only that
+        cluster labeled, so we can compute gradients of the loss `on the model's prediction of that
+        cluster`.  This lets us run interpretation methods using those gradients.  See superclass
+        docstring for more info.
+        """
+        # Digging into an Instance makes mypy go crazy, because we have all kinds of things where
+        # the type has been lost.  So there are lots of `type: ignore`s here...
+        predicted_clusters = outputs['clusters']
+        span_field: ListField = instance['spans']  # type: ignore
+        instances = []
+        for cluster in predicted_clusters:
+            new_instance = deepcopy(instance)
+            span_labels = [0 if (span.span_start, span.span_end) in cluster else -1  # type: ignore
+                           for span in span_field]  # type: ignore
+            new_instance.add_field('span_labels',
+                                   SequenceLabelField(span_labels, span_field),
+                                   self._model.vocab)
+            new_instance['metadata'].metadata['clusters'] = [cluster]  # type: ignore
+            instances.append(new_instance)
+        if not instances:
+            # No predicted clusters; we just give an empty coref prediction.
+            new_instance = deepcopy(instance)
+            span_labels = [-1] * len(span_field)  # type: ignore
+            new_instance.add_field('span_labels',
+                                   SequenceLabelField(span_labels, span_field),
+                                   self._model.vocab)
+            new_instance['metadata'].metadata['clusters'] = []  # type: ignore
+            instances.append(new_instance)
+        return instances
+
     @staticmethod
     def replace_corefs(document: Doc, clusters: List[List[List[int]]]) -> str:
         """
diff --git a/allennlp/predictors/masked_language_model.py b/allennlp/predictors/masked_language_model.py
@@ -0,0 +1,37 @@
+from copy import deepcopy
+from typing import Dict
+
+from overrides import overrides
+import numpy
+
+from allennlp.common.util import JsonDict
+from allennlp.data import Instance, Token
+from allennlp.data.fields import TextField
+from allennlp.predictors.predictor import Predictor
+
+
+@Predictor.register('masked_language_model')
+class MaskedLanguageModelPredictor(Predictor):
+    def predict(self, sentence_with_masks: str) -> JsonDict:
+        return self.predict_json({"sentence" : sentence_with_masks})
+
+    @overrides
+    def predictions_to_labeled_instances(self,
+                                         instance: Instance,
+                                         outputs: Dict[str, numpy.ndarray]):
+        new_instance = deepcopy(instance)
+        token_field: TextField = instance['tokens']  # type: ignore
+        mask_targets = [Token(target_top_k[0]) for target_top_k in outputs['words']]
+        # pylint: disable=protected-access
+        new_instance.add_field('target_ids',
+                               TextField(mask_targets, token_field._token_indexers),
+                               vocab=self._model.vocab)
+        return [new_instance]
+
+    @overrides
+    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
+        """
+        Expects JSON that looks like ``{"sentence": "..."}``.
+        """
+        sentence = json_dict["sentence"]
+        return self._dataset_reader.text_to_instance(sentence=sentence)  # type: ignore
diff --git a/allennlp/predictors/next_token_lm.py b/allennlp/predictors/next_token_lm.py
@@ -0,0 +1,37 @@
+from copy import deepcopy
+from typing import Dict
+
+from overrides import overrides
+import numpy
+
+from allennlp.common.util import JsonDict
+from allennlp.data import Instance, Token
+from allennlp.data.fields import TextField
+from allennlp.predictors.predictor import Predictor
+
+
+@Predictor.register('next_token_lm')
+class NextTokenLMPredictor(Predictor):
+    def predict(self, sentence: str) -> JsonDict:
+        return self.predict_json({"sentence" : sentence})
+
+    @overrides
+    def predictions_to_labeled_instances(self,
+                                         instance: Instance,
+                                         outputs: Dict[str, numpy.ndarray]):
+        new_instance = deepcopy(instance)
+        token_field: TextField = instance['tokens']  # type: ignore
+        mask_targets = [Token(target_top_k[0]) for target_top_k in outputs['words']]
+        # pylint: disable=protected-access
+        new_instance.add_field('target_ids',
+                               TextField(mask_targets, token_field._token_indexers),
+                               vocab=self._model.vocab)
+        return [new_instance]
+
+    @overrides
+    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
+        """
+        Expects JSON that looks like ``{"sentence": "..."}``.
+        """
+        sentence = json_dict["sentence"]
+        return self._dataset_reader.text_to_instance(sentence=sentence)  # type: ignore
diff --git a/allennlp/predictors/predictor.py b/allennlp/predictors/predictor.py
@@ -1,6 +1,7 @@
 from typing import List, Iterator, Dict, Tuple, Any
 import json
 from contextlib import contextmanager
+
 import numpy
 from torch.utils.hooks import RemovableHandle
 from torch import Tensor
@@ -9,10 +10,10 @@
 from allennlp.common.checks import ConfigurationError
 from allennlp.common.util import JsonDict, sanitize
 from allennlp.data import DatasetReader, Instance
+from allennlp.data.dataset import Batch
 from allennlp.models import Model
 from allennlp.models.archival import Archive, load_archive
-from allennlp.modules.text_field_embedders import TextFieldEmbedder
-from allennlp.data.dataset import Batch
+from allennlp.nn import util
 
 # a mapping from model `type` to the default Predictor for that type
 DEFAULT_PREDICTORS = {
@@ -137,10 +138,8 @@ def hook_layers(module, grad_in, grad_out): # pylint: disable=unused-argument
             embedding_gradients.append(grad_out[0])
 
         backward_hooks = []
-        for module in self._model.modules():
-            if isinstance(module, TextFieldEmbedder):
-                backward_hooks.append(module.register_backward_hook(hook_layers))
-
+        embedding_layer = util.find_embedding_layer(self._model)
+        backward_hooks.append(embedding_layer.register_backward_hook(hook_layers))
         return backward_hooks
 
     @contextmanager
@@ -192,7 +191,6 @@ def predictions_to_labeled_instances(self,
         multiple predictions in the output (e.g., in NER a model predicts multiple spans). In this
         case, each instance in the returned list of Instances contains an individual
         entity prediction as the label.
-
         """
         # pylint: disable=unused-argument,no-self-use
         raise RuntimeError("implement this method for model interpretations or attacks")
diff --git a/allennlp/tests/predictors/coref_test.py b/allennlp/tests/predictors/coref_test.py
@@ -1,4 +1,4 @@
-# pylint: disable=no-self-use,invalid-name
+# pylint: disable=no-self-use,invalid-name,protected-access
 import spacy
 
 from allennlp.common.testing import AllenNlpTestCase
@@ -93,3 +93,24 @@ def test_replace_corefs(self):
             doc = nlp(text)
             output = CorefPredictor.replace_corefs(doc, clusters)
             assert output == expected_outputs[i]
+
+    def test_predictions_to_labeled_instances(self):
+        inputs = {"document": "This is a single string document about a test. Sometimes it "
+                              "contains coreferent parts."}
+        archive = load_archive(self.FIXTURES_ROOT / 'coref' / 'serialization' / 'model.tar.gz')
+        predictor = Predictor.from_archive(archive, 'coreference-resolution')
+
+        instance = predictor._json_to_instance(inputs)
+        outputs = predictor._model.forward_on_instance(instance)
+        new_instances = predictor.predictions_to_labeled_instances(instance, outputs)
+        assert new_instances is not None
+
+        for new_instance in new_instances:
+            assert 'span_labels' in new_instance
+            assert len(new_instance['span_labels']) == 60 # one label per span in 'spans'
+            true_top_spans = set(tuple(span) for span in outputs['top_spans'])
+            pred_clust_spans = set()
+            for i, span in enumerate(outputs['top_spans']):
+                if new_instance['span_labels'][i]:
+                    pred_clust_spans.add(tuple(span))
+            assert true_top_spans == pred_clust_spans
diff --git a/allennlp/tests/predictors/masked_language_model_test.py b/allennlp/tests/predictors/masked_language_model_test.py
@@ -0,0 +1,23 @@
+# pylint: disable=no-self-use, protected-access
+from allennlp.common.testing import AllenNlpTestCase
+from allennlp.models.archival import load_archive
+from allennlp.predictors import Predictor
+
+from ..modules.language_model_heads.linear import LinearLanguageModelHead  # pylint: disable=unused-import
+
+
+class TestMaskedLanguageModelPredictor(AllenNlpTestCase):
+    def test_predictions_to_labeled_instances(self):
+        inputs = {
+                "sentence": "Eric [MASK] was an intern at [MASK]",
+        }
+
+        archive = load_archive(self.FIXTURES_ROOT / 'masked_language_model' / 'serialization' / 'model.tar.gz')
+        predictor = Predictor.from_archive(archive, 'masked_language_model')
+
+        instance = predictor._json_to_instance(inputs)
+        outputs = predictor._model.forward_on_instance(instance)
+        new_instances = predictor.predictions_to_labeled_instances(instance, outputs)
+        assert len(new_instances) == 1
+        assert 'target_ids' in new_instances[0]
+        assert len(new_instances[0]['target_ids'].tokens) == 2  # should have added two words
diff --git a/allennlp/tests/predictors/next_token_lm_test.py b/allennlp/tests/predictors/next_token_lm_test.py
@@ -0,0 +1,23 @@
+# pylint: disable=no-self-use, protected-access
+from allennlp.common.testing import AllenNlpTestCase
+from allennlp.models.archival import load_archive
+from allennlp.predictors import Predictor
+
+from ..modules.language_model_heads.linear import LinearLanguageModelHead  # pylint: disable=unused-import
+
+
+class TestNextTokenLMPredictor(AllenNlpTestCase):
+    def test_predictions_to_labeled_instances(self):
+        inputs = {
+                "sentence": "Eric Wallace was an intern at",
+        }
+
+        archive = load_archive(self.FIXTURES_ROOT / 'next_token_lm' / 'serialization' / 'model.tar.gz')
+        predictor = Predictor.from_archive(archive, 'next_token_lm')
+
+        instance = predictor._json_to_instance(inputs)
+        outputs = predictor._model.forward_on_instance(instance)
+        new_instances = predictor.predictions_to_labeled_instances(instance, outputs)
+        assert len(new_instances) == 1
+        assert 'target_ids' in new_instances[0]
+        assert len(new_instances[0]['target_ids'].tokens) == 1  # should have added one word
diff --git a/doc/api/allennlp.predictors.rst b/doc/api/allennlp.predictors.rst
@@ -23,6 +23,8 @@ allennlp.predictors
 * :ref:`Event2MindPredictor<event2mind>`
 * :ref:`AtisParserPredictor<atis-parser>`
 * :ref:`TextClassifierPredictor<text_classifier>`
+* :ref:`MaskedLanguageModelPredictor<masked-language-model>`
+* :ref:`NextTokenLMPredictor<next-token-lm>`
 
 .. _predictor:
 .. automodule:: allennlp.predictors.predictor
@@ -131,3 +133,15 @@ allennlp.predictors
    :members:
    :undoc-members:
    :show-inheritance:
+
+.. _masked-language-model:
+.. automodule:: allennlp.predictors.masked_language_model
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. _next-token-lm:
+.. automodule:: allennlp.predictors.next_token_lm
+   :members:
+   :undoc-members:
+   :show-inheritance: