@@ -203,3 +203,83 @@ def test_truncate_window(self):
         # 1 full window + 1 half window with start/end tokens
         assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 17]
         assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8]
+
+    def test_truncate_window_dont_split_wordpieces(self):
+        """
+        Tests that the sentence is not truncated inside a word that consists
+        of two or more wordpieces.
+        """
+
+        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
+
+        sentence = "the quickest quick brown fox jumped over the quickest dog"
+        tokens = tokenizer.tokenize(sentence)
+
+        vocab = Vocabulary()
+        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
+        token_indexer = PretrainedBertIndexer(str(vocab_path),
+                                              truncate_long_sequences=True,
+                                              use_starting_offsets=True,
+                                              max_pieces=12)
+
+        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")
+
+        # 16 = [CLS], 17 = [SEP]
+        # 1 full window + 1 half window with start/end tokens
+        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
+        # We could fit one more wordpiece here, but we don't, to avoid cutting
+        # in the middle of a word.
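+        # With use_starting_offsets=True each offset should point at the first
+        # wordpiece of its word; the second word "quickest" spans wordpiece
+        # positions 2-3 (presumably "quick" + "##est" in the fixture vocab),
+        # hence the offset 2 below.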
+        assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8, 9]
+        assert indexed_tokens["bert-type-ids"] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+
+        token_indexer = PretrainedBertIndexer(str(vocab_path),
+                                              truncate_long_sequences=True,
+                                              use_starting_offsets=False,
+                                              max_pieces=12)
+
+        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")
+
+        # 16 = [CLS], 17 = [SEP]
+        # 1 full window + 1 half window with start/end tokens
+        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
+        # We could fit one more wordpiece here, but we don't, to avoid cutting
+        # in the middle of a word.
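+        # With use_starting_offsets=False, offsets should instead point at the
+        # last wordpiece of each word, which is why "quickest" now yields
+        # offset 3 rather than 2.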
+        assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9]
+
+    def test_truncate_window_fit_two_wordpieces(self):
+        """
+        Tests that both `use_starting_offsets` settings work properly when the
+        last word in the truncated sentence consists of two wordpieces.
+        """
+
+        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
+
+        sentence = "the quickest quick brown fox jumped over the quickest dog"
+        tokens = tokenizer.tokenize(sentence)
+
+        vocab = Vocabulary()
+        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
+        token_indexer = PretrainedBertIndexer(str(vocab_path),
+                                              truncate_long_sequences=True,
+                                              use_starting_offsets=True,
+                                              max_pieces=13)
+
+        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")
+
+        # 16 = [CLS], 17 = [SEP]
+        # 1 full window + 1 half window with start/end tokens
+        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 3, 4, 17]
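+        # With max_pieces=13 there is room for both wordpieces of the trailing
+        # "quickest" (positions 10-11) plus [SEP], so the word is kept and its
+        # starting offset is 10.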
+        assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8, 9, 10]
+        assert indexed_tokens["bert-type-ids"] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+
+        token_indexer = PretrainedBertIndexer(str(vocab_path),
+                                              truncate_long_sequences=True,
+                                              use_starting_offsets=False,
+                                              max_pieces=13)
+
+        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")
+
+        # 16 = [CLS], 17 = [SEP]
+        # 1 full window + 1 half window with start/end tokens
+        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 3, 4, 17]
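+        # Same sequence as above, but with ending offsets the trailing
+        # "quickest" is reported at position 11, its last wordpiece.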
+        assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9, 11]