@@ -229,14 +229,15 @@ def _get_part_names(self):
             return ("pytorch_model.bin",)
         return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

-    # used for GPT-2 BPE and WordPiece vocabs
-    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
+    def _set_vocab_gpt2(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
         tokens: list[str] = []
         toktypes: list[int] = []

         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
+        tokenizer = AutoTokenizer.from_pretrained(dir_model)
+        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size

         tokpre = self.get_vocab_base_pre(tokenizer)
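Side note on the restored return path above: the :05 format spec zero-pads the shard index, matching HF multi-part checkpoint names. A standalone sketch of the pattern, with a made-up num_parts of 5:

    num_parts = 5  # made-up example value
    names = [f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)]
    print(names[0])   # pytorch_model-00001-of-00005.bin
    print(names[-1])  # pytorch_model-00005-of-00005.bin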
@@ -258,79 +259,12 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)

-        return tokens, toktypes, tokpre
-
-    # NOTE: this function is generated by convert-hf-to-gguf-update.py
-    #       do not modify it manually!
-    # ref:  https://github.com/ggerganov/llama.cpp/pull/6920
-    def get_vocab_base_pre(self, tokenizer) -> str:
-        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
-        # is specific for the BPE pre-tokenizer used by the model
-        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
-        # use in llama.cpp to implement the same pre-tokenizer
-        chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
-
-        chktok = tokenizer.encode(chktxt)
-        chkhsh = sha256(str(chktok).encode()).hexdigest()
-
-        print(f"chktok: {chktok}")
-        print(f"chkhsh: {chkhsh}")
-
-        res = None
-
-        # NOTE: if you get an error here, you need to add the model to the if-elif chain below
-        #       don't do this manually - use the convert-hf-to-gguf-update.py script!
-        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
-            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-            res = "llama-bpe"
-        if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
-            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
-            res = "deepseek-llm"
-        if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
-            # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
-            res = "deepseek-coder"
-        if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
-            # ref: https://huggingface.co/tiiuae/falcon-7b
-            res = "falcon"
-        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
-            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
-            res = "bert-bge"
-        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
-            # ref: https://huggingface.co/mosaicml/mpt-7b
-            res = "mpt"
-        if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34":
-            # ref: https://huggingface.co/bigcode/starcoder2-3b
-            res = "starcoder"
-        if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
-            # ref: https://huggingface.co/openai-community/gpt2
-            res = "gpt-2"
-
-        if res is None:
-            print("\n")
-            print("**************************************************************************************")
-            print("** WARNING: The BPE pre-tokenizer was not recognized!")
-            print("**          This means that it was not added yet or you are using an older version.")
-            print("**          Check convert-hf-to-gguf-update.py and update it accordingly.")
-            print("**")
-            print(f"** chkhsh:  {chkhsh}")
-            print("**************************************************************************************")
-            print("\n")
-            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
-
-        print(f"tokenizer.ggml.pre: {res}")
-        print(f"chkhsh: {chkhsh}")
-
-        return res
-
-    def _set_vocab_gpt2(self) -> None:
-        tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)

-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)

     def _set_vocab_qwen(self):
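For readers skimming the removal: get_vocab_base_pre() identified a BPE pre-tokenizer by encoding a fixed probe string and hashing the resulting token IDs, so two checkpoints get the same fingerprint exactly when they split the probe the same way. A minimal sketch of that idea, assuming the transformers package and using "gpt2" plus a shortened probe text purely as examples:

    from hashlib import sha256
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")         # example checkpoint
    chktok = tokenizer.encode("Hello 🦙 3.3 3..3 ------===")  # shortened stand-in for chktxt
    chkhsh = sha256(str(chktok).encode()).hexdigest()
    print(chkhsh)  # changes whenever the pre-tokenizer splits the probe differently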
@@ -2523,26 +2457,35 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_pooling_type(pooling_type)

     def set_vocab(self):
-        tokens, toktypes, tokpre = self.get_vocab_base()
-        self.vocab_size = len(tokens)
+        # use huggingface vocab to get all tokens
+        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
+        tokens, scores, toktypes = zip(*vocab.all_tokens())
+        assert len(tokens) == vocab.vocab_size
+        self.vocab_size = vocab.vocab_size

         # we need this to validate the size of the token_type embeddings
         # though currently we are passing all zeros to the token_type embeddings
-        self.gguf_writer.add_token_type_count(2)  # "Sequence A" or "Sequence B"
+        n_token_types = len(set(toktypes))
+        self.gguf_writer.add_token_type_count(n_token_types)

         # convert to phantom space vocab
-        def phantom(tok):
-            if tok.startswith("[") and tok.endswith("]"):
+        def phantom(tok, typ):
+            if tok.startswith(b"[") and tok.endswith(b"]"):
                 return tok
-            if tok.startswith("##"):
+            if tok.startswith(b"##"):
                 return tok[2:]
-            return "\u2581" + tok
-        tokens = list(map(phantom, tokens))
+            return b"\xe2\x96\x81" + tok
+        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
+
+        # set up bos and eos tokens (cls and sep)
+        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
+        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)

         # add vocab to gguf
         self.gguf_writer.add_tokenizer_model("bert")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)

         # handle special tokens
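The phantom-space conversion above maps WordPiece conventions (a "##" continuation marker, bracketed specials) onto the SPM-style vocab llama.cpp expects, where word-initial pieces carry U+2581. A str-based sketch of the same mapping (the added code in the hunk operates on bytes):

    def phantom(tok: str) -> str:
        if tok.startswith("[") and tok.endswith("]"):
            return tok         # special token such as [CLS] passes through
        if tok.startswith("##"):
            return tok[2:]     # continuation piece: strip the WordPiece marker
        return "\u2581" + tok  # word-initial piece: add the phantom space

    print(phantom("[CLS]"), phantom("hello"), phantom("##ing"))  # [CLS] ▁hello ing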
@@ -2618,6 +2561,16 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])

+    def get_tensors(self):
+        assert self.vocab_size is not None
+        for name, data in super().get_tensors():
+            # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
+            if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
+                rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
+                assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
+                data = data[:self.vocab_size, :]
+            yield name, data
+

 @Model.register("GemmaForCausalLM")
 class GemmaModel(Model):
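The slicing in get_tensors() relies on the embedding rows having been padded up to a multiple of 64, and (v + 63) // 64 * 64 is the usual integer round-up. A quick check of the arithmetic, using BERT's 30522-token vocab only as an example value:

    vocab_size = 30522  # example value
    rounded_vocab_size = (vocab_size + 63) // 64 * 64
    print(rounded_vocab_size)  # 30528; rows beyond vocab_size get sliced away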
@@ -2818,8 +2771,7 @@ def write_tensors(self):
                 data = data.astype(np.float32)

             # if f16 desired, convert big float32 2-dim weight tensors to float16
-            new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
-            if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
+            if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
                 data = data.astype(np.float16)

             print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")