This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Commit c78bb36

Add a correctness test for Open AI transformer (#1801)
* Add a correctness check for OpenAI BPE encoding
* Add test fixtures
* Add correctness test for OpenAI
* pylint
1 parent e64373c commit c78bb36

File tree

6 files changed: +113 −2 lines changed

allennlp/data/token_indexers/openai_transformer_byte_pair_indexer.py (+19)

@@ -1,6 +1,7 @@
 from typing import Dict, List, Tuple
 import json
 import tarfile
+import re

 from overrides import overrides

@@ -11,6 +12,21 @@
 from allennlp.data.tokenizers.token import Token
 from allennlp.data.token_indexers.token_indexer import TokenIndexer

+def text_standardize(text):
+    """
+    Apply text standardization following the original implementation.
+    """
+    # pylint: disable=anomalous-backslash-in-string
+    text = text.replace('—', '-')
+    text = text.replace('–', '-')
+    text = text.replace('―', '-')
+    text = text.replace('…', '...')
+    text = text.replace('´', "'")
+    text = re.sub('''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
+    text = re.sub('\s*\n\s*', ' \n ', text)
+    text = re.sub('[^\S\n]+', ' ', text)
+    return text.strip()
+

 @TokenIndexer.register("openai_transformer_byte_pair")
 class OpenaiTransformerBytePairIndexer(TokenIndexer[int]):

@@ -21,6 +37,9 @@ class OpenaiTransformerBytePairIndexer(TokenIndexer[int]):
     This is unlike most of our TokenIndexers in that its
     indexing is not based on a `Vocabulary` but on a fixed
     set of mappings that are loaded by the constructor.
+
+    Note: the original implementation applied ``text_standardize`` before
+    tokenizing.
+    """
     # pylint: disable=no-self-use
     def __init__(self,
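
A quick illustration (my own, not part of the commit) of what ``text_standardize`` does: dash variants and ellipses are normalized, and runs of the punctuation characters in the regex above are padded with spaces so spaCy splits them into separate tokens. The output in the comment is what the substitutions above should produce.

    from allennlp.data.token_indexers.openai_transformer_byte_pair_indexer import text_standardize

    print(text_standardize('Hello—world… (really!)'))
    # 'Hello - world... ( really ! )'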

allennlp/tests/data/token_indexers/openai_transformer_byte_pair_indexer_test.py (+23)

@@ -1,12 +1,14 @@
 # pylint: disable=no-self-use,invalid-name,protected-access
 import json
 import tarfile
+import spacy

 import pytest

 from allennlp.common.testing import AllenNlpTestCase
 from allennlp.data import Token
 from allennlp.data.token_indexers import OpenaiTransformerBytePairIndexer
+from allennlp.data.token_indexers.openai_transformer_byte_pair_indexer import text_standardize
 from allennlp.data.vocabulary import Vocabulary


@@ -99,3 +101,24 @@ def test_raises_with_too_long_sentence(self):

         with pytest.raises(RuntimeError):
             self.indexer.tokens_to_indices(tokens, self.vocab, 'should-fail')
+
+    @pytest.mark.skip()
+    def test_for_correctness_with_fixture(self):
+        bpe_path = "https://s3-us-west-2.amazonaws.com/allennlp/models/openai-transformer-lm-2018.07.23.tar.gz"
+        indexer = OpenaiTransformerBytePairIndexer(model_path=bpe_path)
+
+        with open(self.FIXTURES_ROOT / 'openai_transformer' / 'text.txt', 'r') as fin:
+            sentences = fin.read().strip().split('\n')
+        with open(self.FIXTURES_ROOT / 'openai_transformer' / 'indexed_text.json', 'r') as fin:
+            expected_indices = json.load(fin)
+
+        # tokenize and check that indices are correct
+        nlp = spacy.load('en_core_web_sm')
+
+        for k, sentence in enumerate(sentences):
+            tokens = [token.text for token in nlp(text_standardize(sentence)) if not token.is_space]
+            indices = indexer.tokens_to_indices(
+                    [Token(token) for token in tokens], Vocabulary(), 'openai_indexer'
+            )
+            non_padded_indices = [i for i in indices['openai_indexer'] if i != 0]
+            assert non_padded_indices == expected_indices[k]
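
The skipped test above reads its expected output from the ``indexed_text.json`` fixture added below. For reference, a sketch of how that fixture could be regenerated (my own helper, mirroring the test body; it assumes network access to the same model archive and the ``en_core_web_sm`` spaCy model):

    import json
    import spacy

    from allennlp.data import Token
    from allennlp.data.token_indexers import OpenaiTransformerBytePairIndexer
    from allennlp.data.token_indexers.openai_transformer_byte_pair_indexer import text_standardize
    from allennlp.data.vocabulary import Vocabulary

    def regenerate_fixture(text_path: str, out_path: str) -> None:
        # hypothetical helper, not in the commit: index each sentence exactly as the test does
        model_path = "https://s3-us-west-2.amazonaws.com/allennlp/models/openai-transformer-lm-2018.07.23.tar.gz"
        indexer = OpenaiTransformerBytePairIndexer(model_path=model_path)
        nlp = spacy.load('en_core_web_sm')

        with open(text_path, 'r') as fin:
            sentences = fin.read().strip().split('\n')

        all_indices = []
        for sentence in sentences:
            tokens = [token.text for token in nlp(text_standardize(sentence)) if not token.is_space]
            indices = indexer.tokens_to_indices([Token(t) for t in tokens], Vocabulary(), 'openai_indexer')
            # drop padding (index 0) so the fixture stores only the real BPE ids
            all_indices.append([i for i in indices['openai_indexer'] if i != 0])

        with open(out_path, 'w') as fout:
            json.dump(all_indices, fout)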
allennlp/tests/fixtures/openai_transformer/expected_embeddings.hdf5

Binary file not shown.
allennlp/tests/fixtures/openai_transformer/indexed_text.json (+1)

@@ -0,0 +1 @@
+[[249, 1925, 485, 6231, 246, 4121, 669, 1662, 939, 715, 1009, 995, 239, 861, 1081, 822, 37700, 606, 1925, 504, 20267, 239], [2703, 13819, 566, 2795, 525, 487, 980, 538, 999, 524, 1114, 589, 850, 239, 246, 267, 305, 285, 267, 67, 3906, 23, 18493, 13103, 43, 38380, 49, 50, 54, 53, 48, 31446, 13103, 43, 13103, 43, 13103, 43, 13103, 15870, 239]]
allennlp/tests/fixtures/openai_transformer/text.txt (+2)

@@ -0,0 +1,2 @@
+I decided to rent a movie when friends came over last night. After looking through selections we decided on comedy.
+James realizes one afternoon that he hasn't left his house all day. A !@#!@OOVTOKEN 1234567890551231231231231.

allennlp/tests/modules/token_embedders/openai_transformer_embedder_test.py (+68 −2)

@@ -1,8 +1,19 @@
 # pylint: disable=no-self-use,invalid-name
 import pytest
+import spacy
+import torch
+import numpy
+import h5py

-from allennlp.common.testing import ModelTestCase
+from allennlp.common.testing import ModelTestCase, AllenNlpTestCase
 from allennlp.data.dataset import Batch
+from allennlp.data import Token
+from allennlp.data.token_indexers import OpenaiTransformerBytePairIndexer
+from allennlp.data.token_indexers.openai_transformer_byte_pair_indexer import text_standardize
+from allennlp.data.vocabulary import Vocabulary
+from allennlp.modules.openai_transformer import OpenaiTransformer
+from allennlp.nn.util import get_range_vector
+

 # Skip this one, it's an expensive test.
 @pytest.mark.skip()

@@ -54,6 +65,62 @@ def test_tagger_with_openai_token_embedder_forward_pass_runs_correctly(self):
         assert tag in {'O', 'I-ORG', 'I-PER', 'I-LOC'}


+# Skip this one, it's an expensive test.
+@pytest.mark.skip()
+class TestOpenAiTransformerEmbedderCorrectWithFixture(AllenNlpTestCase):
+    """
+    Test that the implementation produces the same embeddings as the TensorFlow model.
+    """
+    def test_openai_transformer_matches_tensorflow(self):
+        model_path = "https://s3-us-west-2.amazonaws.com/allennlp/models/openai-transformer-lm-2018.07.23.tar.gz"
+        indexer = OpenaiTransformerBytePairIndexer(model_path=model_path)
+        transformer = OpenaiTransformer(model_path=model_path)
+
+        # get the test sentences
+        with open(self.FIXTURES_ROOT / 'openai_transformer' / 'text.txt', 'r') as fin:
+            sentences = fin.read().strip().split('\n')
+
+        # tokenize and check that indices are correct
+        nlp = spacy.load('en_core_web_sm')
+
+        # make a batch of two sentences
+        batch_indices = []
+        batch_lengths = []
+        for k, sentence in enumerate(sentences):
+            tokens = [token.text for token in nlp(text_standardize(sentence)) if not token.is_space]
+            indices = indexer.tokens_to_indices(
+                    [Token(token) for token in tokens], Vocabulary(), 'openai_indexer'
+            )
+            batch_indices.append(indices['openai_indexer'])
+            batch_lengths.append(len([i for i in indices['openai_indexer'] if i != 0]))
+        batch_indices = torch.from_numpy(numpy.array(batch_indices))
+        batch_size, num_timesteps = batch_indices.size()
+        vocab_size = transformer.vocab_size - transformer.n_ctx
+        positional_encodings = get_range_vector(num_timesteps, device=-1) + vocab_size
+
+        # Combine the inputs with positional encodings
+        batch_tensor = torch.stack([
+                batch_indices,  # (batch_size, num_timesteps)
+                positional_encodings.expand(batch_size, num_timesteps)
+        ], dim=-1)
+
+        # run the LM
+        transformer.eval()
+        activations = transformer(batch_tensor)
+
+        # load the expected activations
+        expected_activations = []
+        with h5py.File(self.FIXTURES_ROOT / 'openai_transformer' / 'expected_embeddings.hdf5', 'r') as fin:
+            expected_activations.append(fin['0'][...])
+            expected_activations.append(fin['1'][...])
+
+        # just check the top layer
+        for k in range(2):
+            actual = activations[-1][k, :batch_lengths[k], :].numpy()
+            expected = expected_activations[k]
+            numpy.testing.assert_almost_equal(expected, actual, decimal=5)
+
+
 def create_small_test_fixture(output_dir: str = '/tmp') -> None:
     """
     This is how I created the transformer_model.tar.gz.

@@ -65,7 +132,6 @@ def create_small_test_fixture(output_dir: str = '/tmp') -> None:
     """
     import json
     import pathlib
-    from allennlp.modules.openai_transformer import OpenaiTransformer

     model_dir = pathlib.Path(output_dir) / 'model'
     model_dir.mkdir(exist_ok=True)  # pylint: disable=no-member
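
An aside on the positional-encoding step in the test above (my own toy illustration, not part of the commit): the OpenAI model stores positional embeddings in the same lookup table as the BPE embeddings, directly after the word pieces, so position ids are offset by the vocabulary size and stacked with the token ids along a trailing dimension.

    import torch

    # toy sizes; the released model uses vocab_size = 40478 and n_ctx = 512
    vocab_size, n_ctx = 10, 4
    batch_indices = torch.tensor([[3, 7, 2, 0],
                                  [5, 1, 0, 0]])  # (batch_size, num_timesteps)
    batch_size, num_timesteps = batch_indices.size()

    # position ids live *after* the word ids in the embedding table
    positional_encodings = torch.arange(num_timesteps) + vocab_size  # tensor([10, 11, 12, 13])

    batch_tensor = torch.stack([
            batch_indices,
            positional_encodings.expand(batch_size, num_timesteps),
    ], dim=-1)
    print(batch_tensor.shape)  # torch.Size([2, 4, 2])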

0 commit comments
