Skip to content
This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Commit 3f54fc8

Browse files
joelgrus authored and DeNeutoy committed
make openai transformer byte pair indexer add to the vocab (#1705)
fixes #1700
1 parent 7bf930f commit 3f54fc8

File tree

2 files changed

+29
-5
lines changed

2 files changed

+29
-5
lines changed

allennlp/data/token_indexers/openai_transformer_byte_pair_indexer.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,10 @@ def __init__(self,
2727
encoder: Dict[str, int] = None,
2828
byte_pairs: List[Tuple[str, str]] = None,
2929
n_ctx: int = 512,
30-
model_path: str = None) -> None:
30+
model_path: str = None,
31+
namespace: str = 'openai_transformer') -> None:
32+
self._namespace = namespace
33+
self._added_to_vocabulary = False
3134

3235
too_much_information = model_path and (encoder or byte_pairs)
3336
too_little_information = not model_path and not (encoder and byte_pairs)
@@ -143,12 +146,21 @@ def byte_pair_encode(self, token: Token, lowercase: bool = True) -> List[str]:
143146
self.cache[text] = word
144147
return word
145148

149+
def _add_encoding_to_vocabulary(self, vocabulary: Vocabulary) -> None:
150+
# pylint: disable=protected-access
151+
for word, idx in self.encoder.items():
152+
vocabulary._token_to_index[self._namespace][word] = idx
153+
vocabulary._index_to_token[self._namespace][idx] = word
146154

147155
@overrides
148156
def tokens_to_indices(self,
149157
tokens: List[Token],
150-
_vocabulary: Vocabulary,
158+
vocabulary: Vocabulary,
151159
index_name: str) -> Dict[str, List[int]]:
160+
if not self._added_to_vocabulary:
161+
self._add_encoding_to_vocabulary(vocabulary)
162+
self._added_to_vocabulary = True
163+
152164
text_tokens = []
153165
offsets = []
154166
offset = -1

allennlp/tests/data/token_indexers/openai_transformer_byte_pair_indexer_test.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# pylint: disable=no-self-use,invalid-name
1+
# pylint: disable=no-self-use,invalid-name,protected-access
22
import json
33
import tarfile
44

@@ -7,6 +7,7 @@
77
from allennlp.common.testing import AllenNlpTestCase
88
from allennlp.data import Token
99
from allennlp.data.token_indexers import OpenaiTransformerBytePairIndexer
10+
from allennlp.data.vocabulary import Vocabulary
1011

1112

1213
class TestOpenaiTransformerBytePairIndexer(AllenNlpTestCase):
@@ -39,6 +40,7 @@ def setUp(self):
3940
tf.add(bpe_path, 'model/vocab_40000.bpe')
4041

4142
self.indexer = OpenaiTransformerBytePairIndexer(encoding, byte_pairs)
43+
self.vocab = Vocabulary(non_padded_namespaces=['openai_transformer'])
4244

4345
def test_bpe(self):
4446

@@ -63,7 +65,17 @@ def test_bpe(self):
6365
def test_tokens_to_indices(self):
6466
tokens = [Token('ewoe'), Token('woe'), Token('ewe'), Token('ee')]
6567

66-
indices = self.indexer.tokens_to_indices(tokens, None, 'test')
68+
# vocab should be empty initially
69+
assert 'openai_transformer' not in self.vocab._index_to_token
70+
assert 'openai_transformer' not in self.vocab._token_to_index
71+
72+
indices = self.indexer.tokens_to_indices(tokens, self.vocab, 'test')
73+
74+
# vocab should be full now
75+
i2t = self.vocab._index_to_token.get('openai_transformer')
76+
t2i = self.vocab._token_to_index.get('openai_transformer')
77+
assert len(i2t) == 5 * 5 * 2
78+
assert len(t2i) == 5 * 5 * 2
6779

6880
assert set(indices.keys()) == {"test", "test-offsets", "mask"}
6981

@@ -86,4 +98,4 @@ def test_raises_with_too_long_sentence(self):
8698
tokens = [Token('a') for _ in range(513)]
8799

88100
with pytest.raises(RuntimeError):
89-
self.indexer.tokens_to_indices(tokens, None, 'should-fail')
101+
self.indexer.tokens_to_indices(tokens, self.vocab, 'should-fail')

0 commit comments

Comments (0)