
Commit 7e08298

maksymbevza authored and joelgrus committed

Fix wordpiece indexer truncation (#2931)

* Fix wordpiece indexer
* Add comments for test and count pieces accumulated

1 parent 03aa838

2 files changed: +87 -3 lines changed

allennlp/data/token_indexers/wordpiece_indexer.py (+7 -3)

@@ -181,10 +181,12 @@ def tokens_to_indices(self,
         # offset is the last wordpiece of "tokens[-1]".
         offset = len(self._start_piece_ids) if self.use_starting_offsets else len(self._start_piece_ids) - 1
 
+        # Count amount of wordpieces accumulated
+        pieces_accumulated = 0
         for token in token_wordpiece_ids:
             # Truncate the sequence if specified, which depends on where the offsets are
             next_offset = 1 if self.use_starting_offsets else 0
-            if self._truncate_long_sequences and offset >= window_length + next_offset:
+            if self._truncate_long_sequences and offset + len(token) - 1 >= window_length + next_offset:
                 break
 
             # For initial offsets, the current value of ``offset`` is the start of
@@ -198,15 +200,17 @@ def tokens_to_indices(self,
             offset += len(token)
             offsets.append(offset)
 
+            pieces_accumulated += len(token)
+
         if len(flat_wordpiece_ids) <= window_length:
             # If all the wordpieces fit, then we don't need to do anything special
             wordpiece_windows = [self._add_start_and_end(flat_wordpiece_ids)]
             token_type_ids = self._extend(flat_token_type_ids)
         elif self._truncate_long_sequences:
             logger.warning("Too many wordpieces, truncating sequence. If you would like a sliding window, set"
                            "`truncate_long_sequences` to False %s", str([token.text for token in tokens]))
-            wordpiece_windows = [self._add_start_and_end(flat_wordpiece_ids[:window_length])]
-            token_type_ids = self._extend(flat_token_type_ids[:window_length])
+            wordpiece_windows = [self._add_start_and_end(flat_wordpiece_ids[:pieces_accumulated])]
+            token_type_ids = self._extend(flat_token_type_ids[:pieces_accumulated])
         else:
             # Create a sliding window of wordpieces of length `max_pieces` that advances by `stride` steps and
             # add start/end wordpieces to each window
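
A minimal, self-contained sketch of the rule this hunk implements, for readers outside of AllenNLP: walk the per-word lists of wordpiece ids and stop before the first word whose pieces would cross the window boundary, so every word is either kept whole or dropped entirely. The truncate_wordpieces helper and the piece ids below are illustrative only, not part of the AllenNLP API.

from typing import List

def truncate_wordpieces(word_pieces: List[List[int]], window_length: int) -> List[int]:
    """Flatten per-word wordpiece ids, stopping before any word whose
    pieces would cross the window boundary (no word is ever split)."""
    flat: List[int] = []
    for pieces in word_pieces:
        if len(flat) + len(pieces) > window_length:
            break  # drop the whole word instead of splitting it
        flat.extend(pieces)
    return flat

# Hypothetical piece ids: the middle word is split into three wordpieces.
words = [[7], [12, 44, 51], [9]]
print(truncate_wordpieces(words, window_length=3))  # [7] -- the 3-piece word is dropped whole
print(truncate_wordpieces(words, window_length=5))  # [7, 12, 44, 51, 9] -- everything fits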

allennlp/tests/data/token_indexers/bert_indexer_test.py (+80)

@@ -203,3 +203,83 @@ def test_truncate_window(self):
         # 1 full window + 1 half window with start/end tokens
         assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 17]
         assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8]
+
+    def test_truncate_window_dont_split_wordpieces(self):
+        """
+        Tests that the sentence is not truncated inside a word that has two or
+        more wordpieces.
+        """
+
+        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
+
+        sentence = "the quickest quick brown fox jumped over the quickest dog"
+        tokens = tokenizer.tokenize(sentence)
+
+        vocab = Vocabulary()
+        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
+        token_indexer = PretrainedBertIndexer(str(vocab_path),
+                                              truncate_long_sequences=True,
+                                              use_starting_offsets=True,
+                                              max_pieces=12)
+
+        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")
+
+        # 16 = [CLS], 17 = [SEP]
+        # 1 full window + 1 half window with start/end tokens
+        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
+        # We could fit one more piece here, but we don't, so that we never cut
+        # in the middle of a word
+        assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8, 9]
+        assert indexed_tokens["bert-type-ids"] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+
+        token_indexer = PretrainedBertIndexer(str(vocab_path),
+                                              truncate_long_sequences=True,
+                                              use_starting_offsets=False,
+                                              max_pieces=12)
+
+        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")
+
+        # 16 = [CLS], 17 = [SEP]
+        # 1 full window + 1 half window with start/end tokens
+        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
+        # We could fit one more piece here, but we don't, so that we never cut
+        # in the middle of a word
+        assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9]
+
+    def test_truncate_window_fit_two_wordpieces(self):
+        """
+        Tests that both `use_starting_offsets` options work properly when the last
+        word in the truncated sentence consists of two wordpieces.
+        """
+
+        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
+
+        sentence = "the quickest quick brown fox jumped over the quickest dog"
+        tokens = tokenizer.tokenize(sentence)
+
+        vocab = Vocabulary()
+        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
+        token_indexer = PretrainedBertIndexer(str(vocab_path),
+                                              truncate_long_sequences=True,
+                                              use_starting_offsets=True,
+                                              max_pieces=13)
+
+        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")
+
+        # 16 = [CLS], 17 = [SEP]
+        # 1 full window + 1 half window with start/end tokens
+        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 3, 4, 17]
+        assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8, 9, 10]
+        assert indexed_tokens["bert-type-ids"] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+
+        token_indexer = PretrainedBertIndexer(str(vocab_path),
+                                              truncate_long_sequences=True,
+                                              use_starting_offsets=False,
+                                              max_pieces=13)
+
+        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")
+
+        # 16 = [CLS], 17 = [SEP]
+        # 1 full window + 1 half window with start/end tokens
+        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 3, 4, 17]
+        assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9, 11]
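
The expected values in these tests follow from the same word-boundary rule. Judging from the expected output, the fixture vocabulary evidently splits "quickest" into two wordpieces (ids 3 and 4) while the other kept words map to single pieces, and 16/17 are [CLS]/[SEP] as the comments state. A rough check of the arithmetic, using hypothetical per-word piece ids inferred from those expectations (the id for "dog" is a guess; it is truncated away in both cases):

# Sentence: "the quickest quick brown fox jumped over the quickest dog"
# Hypothetical per-word wordpiece ids inferred from the expected output above;
# every word is one piece except "quickest", which is two pieces (3, 4).
pieces_per_word = [[2], [3, 4], [3], [5], [6], [8], [9], [2], [3, 4], [10]]  # [10] for "dog" is a guess

for max_pieces in (12, 13):
    window = max_pieces - 2          # two slots are reserved for [CLS] and [SEP]
    kept = []
    for pieces in pieces_per_word:
        if len(kept) + len(pieces) > window:
            break                    # drop the whole word rather than splitting it
        kept.extend(pieces)
    print(max_pieces, [16] + kept + [17])
# 12 -> [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]        (one slot left unused, matching the first test)
# 13 -> [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 3, 4, 17]  (both pieces of the second "quickest" fit)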
