This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Commit 02e2930

OyvindTafjord authored and DeNeutoy committed
Model can store extra pretrained embeddings (#1817)
This adds a `min_pretrained_embeddings` parameter to the `"embedding"` token embedder, which keeps at least that many embeddings from the top of an embedding text file (like GloVe). This is a pragmatic way to support unseen words, say in a demo, at the cost of a larger model size (e.g., specifying 200k here increases model.tar.gz by about 75MB). It leverages the fact that, at least for GloVe files, the words are ordered by frequency, so by the time you get to, say, 200k, you're mostly missing out on rare words whose embeddings are less useful anyway. I'd use this in the QuaRel demo.
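For concreteness, here is a minimal sketch of how the parameter is exercised through `Embedding.from_params` (mirroring the new test below); the GloVe path is a placeholder, and 200k is just the example figure from above:

```python
# A minimal sketch, assuming a local GloVe text file; the path is a placeholder.
from allennlp.common import Params
from allennlp.data import Vocabulary
from allennlp.modules.token_embedders import Embedding

vocab = Vocabulary()  # normally built from your dataset
params = Params({
    'pretrained_file': '/path/to/glove.6B.100d.txt.gz',  # placeholder path
    'embedding_dim': 100,
    'min_pretrained_embeddings': 200000,  # keep the first 200k rows (~75MB larger model.tar.gz, per above)
})
embedding = Embedding.from_params(vocab, params)
# Side effect of this change: `vocab` now also contains the kept pretrained tokens,
# so common unseen words map to real vectors instead of @@UNKNOWN@@.
```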
1 parent f65ced5 commit 02e2930

File tree

2 files changed: +33 −9 lines changed


allennlp/modules/token_embedders/embedding.py

+19 −9

@@ -182,6 +182,7 @@ def from_params(cls, vocab: Vocabulary, params: Params) -> 'Embedding': # type:
         norm_type = params.pop_float('norm_type', 2.)
         scale_grad_by_freq = params.pop_bool('scale_grad_by_freq', False)
         sparse = params.pop_bool('sparse', False)
+        min_pretrained_embeddings = params.pop_int("min_pretrained_embeddings", 0)
         params.assert_empty(cls.__name__)
 
         if pretrained_file:
@@ -191,7 +192,10 @@ def from_params(cls, vocab: Vocabulary, params: Params) -> 'Embedding': # type:
             weight = _read_pretrained_embeddings_file(pretrained_file,
                                                       embedding_dim,
                                                       vocab,
-                                                      vocab_namespace)
+                                                      vocab_namespace,
+                                                      min_pretrained_embeddings)
+            if min_pretrained_embeddings > 0:
+                num_embeddings = vocab.get_vocab_size(vocab_namespace)
         else:
             weight = None
 
@@ -210,7 +214,8 @@ def from_params(cls, vocab: Vocabulary, params: Params) -> 'Embedding': # type:
 def _read_pretrained_embeddings_file(file_uri: str,
                                      embedding_dim: int,
                                      vocab: Vocabulary,
-                                     namespace: str = "tokens") -> torch.FloatTensor:
+                                     namespace: str = "tokens",
+                                     min_pretrained_embeddings: int = None) -> torch.FloatTensor:
     """
     Returns and embedding matrix for the given vocabulary using the pretrained embeddings
     contained in the given file. Embeddings for tokens not found in the pretrained embedding file
@@ -244,8 +249,9 @@ def _read_pretrained_embeddings_file(file_uri: str,
         A Vocabulary object.
     namespace : str, (optional, default=tokens)
         The namespace of the vocabulary to find pretrained embeddings for.
-    trainable : bool, (optional, default=True)
-        Whether or not the embedding parameters should be optimized.
+    min_pretrained_embeddings : int, (optional, default=None):
+        If given, will keep at least this number of embeddings from the start of the pretrained
+        embedding text file (typically the most common words)
 
     Returns
     -------
@@ -261,13 +267,14 @@ def _read_pretrained_embeddings_file(file_uri: str,
 
     return _read_embeddings_from_text_file(file_uri,
                                            embedding_dim,
-                                           vocab, namespace)
+                                           vocab, namespace, min_pretrained_embeddings)
 
 
 def _read_embeddings_from_text_file(file_uri: str,
                                     embedding_dim: int,
                                     vocab: Vocabulary,
-                                    namespace: str = "tokens") -> torch.FloatTensor:
+                                    namespace: str = "tokens",
+                                    min_pretrained_embeddings: int = 0) -> torch.FloatTensor:
     """
     Read pre-trained word vectors from an eventually compressed text file, possibly contained
     inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
@@ -278,16 +285,15 @@ def _read_embeddings_from_text_file(file_uri: str,
     The remainder of the docstring is identical to ``_read_pretrained_embeddings_file``.
     """
     tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
-    vocab_size = vocab.get_vocab_size(namespace)
     embeddings = {}
 
     # First we read the embeddings from the file, only keeping vectors for the words we need.
     logger.info("Reading pretrained embeddings from file")
 
     with EmbeddingsTextFile(file_uri) as embeddings_file:
-        for line in Tqdm.tqdm(embeddings_file):
+        for index, line in Tqdm.tqdm(enumerate(embeddings_file)):
             token = line.split(' ', 1)[0]
-            if token in tokens_to_keep:
+            if token in tokens_to_keep or index < min_pretrained_embeddings:
                 fields = line.rstrip().split(' ')
                 if len(fields) - 1 != embedding_dim:
                     # Sometimes there are funny unicode parsing problems that lead to different
@@ -303,6 +309,10 @@ def _read_embeddings_from_text_file(file_uri: str,
 
                 vector = numpy.asarray(fields[1:], dtype='float32')
                 embeddings[token] = vector
+                if token not in tokens_to_keep:
+                    vocab.add_token_to_namespace(token, namespace)
+
+    vocab_size = vocab.get_vocab_size(namespace)
 
     if not embeddings:
         raise ConfigurationError("No embeddings of correct dimension found; you probably "
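
To make the new filtering rule easier to see in isolation, here is a small standalone sketch (not the AllenNLP code itself; it omits the vocabulary update and the dimension check): a row is kept if its token is already in the vocabulary, or if it is among the first `min_pretrained_embeddings` rows of the frequency-ordered file.

```python
# Standalone illustration of the selection rule added above (names here are illustrative).
from typing import Dict, List, Set

import numpy


def select_pretrained_rows(lines: List[str],
                           tokens_to_keep: Set[str],
                           min_pretrained_embeddings: int) -> Dict[str, numpy.ndarray]:
    embeddings: Dict[str, numpy.ndarray] = {}
    for index, line in enumerate(lines):
        token = line.split(' ', 1)[0]
        # Keep vocab tokens, plus everything in the first `min_pretrained_embeddings` rows.
        if token in tokens_to_keep or index < min_pretrained_embeddings:
            fields = line.rstrip().split(' ')
            embeddings[token] = numpy.asarray(fields[1:], dtype='float32')
    return embeddings


# 'the' is kept because it is in the vocab; 'of' only because it is within the first 2 rows.
rows = ["the 0.1 0.2", "of 0.3 0.4", "his 0.5 0.6"]
kept = select_pretrained_rows(rows, tokens_to_keep={"the"}, min_pretrained_embeddings=2)
assert set(kept) == {"the", "of"}
```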

allennlp/tests/modules/token_embedders/embedding_test.py

+14

@@ -53,6 +53,20 @@ def test_forward_works_with_projection_layer(self):
         embedded = embedding_layer(input_tensor).data.numpy()
         assert embedded.shape == (1, 1, 4, 20)
 
+    def test_min_pretrained_embeddings(self):
+        vocab = Vocabulary()
+        vocab.add_token_to_namespace('the')
+        vocab.add_token_to_namespace('a')
+        params = Params({
+                'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/glove.6B.100d.sample.txt.gz'),
+                'embedding_dim': 100,
+                'min_pretrained_embeddings': 50
+        })
+        # This will now update vocab
+        _ = Embedding.from_params(vocab, params)
+        assert vocab.get_vocab_size() >= 50
+        assert vocab.get_token_index("his") > 1  # not @@UNKNOWN@@
+
     def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
         vocab = Vocabulary()
         vocab.add_token_to_namespace("word")
