
Commit 82e6483

Revert "BERT tokenizer fixes (ggml-org#6498)"
This reverts commit 1b67731.
1 parent: b9778d3

File tree: 20 files changed, +174 -264 lines

common/common.cpp

Lines changed: 8 additions & 8 deletions
@@ -2332,23 +2332,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 std::vector<llama_token> llama_tokenize(
   const struct llama_context * ctx,
         const std::string & text,
-        bool add_special,
-        bool parse_special) {
-    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+        bool add_bos,
+        bool special) {
+    return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
 }
 
 std::vector<llama_token> llama_tokenize(
   const struct llama_model * model,
         const std::string & text,
-        bool add_special,
-        bool parse_special) {
+        bool add_bos,
+        bool special) {
     // upper limit for the number of tokens
-    int n_tokens = text.length() + 2 * add_special;
+    int n_tokens = text.length() + add_bos;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);

common/common.h

Lines changed: 4 additions & 4 deletions
@@ -232,14 +232,14 @@ void llama_batch_add(
 std::vector<llama_token> llama_tokenize(
   const struct llama_context * ctx,
         const std::string & text,
-        bool add_special,
-        bool parse_special = false);
+        bool add_bos,
+        bool special = false);
 
 std::vector<llama_token> llama_tokenize(
   const struct llama_model * model,
         const std::string & text,
-        bool add_special,
-        bool parse_special = false);
+        bool add_bos,
+        bool special = false);
 
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
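
For context, a minimal sketch of how callers use the reverted two-flag helper; the wrapper below is illustrative only and not part of this commit. The name tokenize_prompt is hypothetical, while llama_tokenize, llama_get_model, and llama_should_add_bos_token are the functions that appear in the diffs of this commit.

// Hypothetical wrapper (not from this commit) showing the reverted call pattern.
#include <string>
#include <vector>

#include "common.h" // declares the two-flag llama_tokenize helpers shown above

static std::vector<llama_token> tokenize_prompt(llama_context * ctx, const std::string & prompt) {
    // after the revert, each caller decides whether a BOS token is prepended
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));

    // the second flag controls whether special tokens in the text are parsed
    return ::llama_tokenize(ctx, prompt, add_bos, /*special =*/ true);
}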

convert-hf-to-gguf.py

Lines changed: 34 additions & 82 deletions
@@ -229,14 +229,15 @@ def _get_part_names(self):
             return ("pytorch_model.bin",)
         return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
 
-    # used for GPT-2 BPE and WordPiece vocabs
-    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
+    def _set_vocab_gpt2(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
         tokens: list[str] = []
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
+        tokenizer = AutoTokenizer.from_pretrained(dir_model)
+        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size
 
         tokpre = self.get_vocab_base_pre(tokenizer)
@@ -258,79 +259,12 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
             tokens.append(reverse_vocab[i])
             toktypes.append(gguf.TokenType.NORMAL)
 
-        return tokens, toktypes, tokpre
-
-    # NOTE: this function is generated by convert-hf-to-gguf-update.py
-    # do not modify it manually!
-    # ref: https://github.com/ggerganov/llama.cpp/pull/6920
-    def get_vocab_base_pre(self, tokenizer) -> str:
-        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
-        # is specific for the BPE pre-tokenizer used by the model
-        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
-        # use in llama.cpp to implement the same pre-tokenizer
-
-        chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
-
-        chktok = tokenizer.encode(chktxt)
-        chkhsh = sha256(str(chktok).encode()).hexdigest()
-
-        print(f"chktok: {chktok}")
-        print(f"chkhsh: {chkhsh}")
-
-        res = None
-
-        # NOTE: if you get an error here, you need to add the model to the if-elif chain below
-        # don't do this manually - use the convert-hf-to-gguf-update.py script!
-        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
-            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-            res = "llama-bpe"
-        if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
-            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
-            res = "deepseek-llm"
-        if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
-            # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
-            res = "deepseek-coder"
-        if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
-            # ref: https://huggingface.co/tiiuae/falcon-7b
-            res = "falcon"
-        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
-            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
-            res = "bert-bge"
-        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
-            # ref: https://huggingface.co/mosaicml/mpt-7b
-            res = "mpt"
-        if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34":
-            # ref: https://huggingface.co/bigcode/starcoder2-3b
-            res = "starcoder"
-        if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
-            # ref: https://huggingface.co/openai-community/gpt2
-            res = "gpt-2"
-
-        if res is None:
-            print("\n")
-            print("**************************************************************************************")
-            print("** WARNING: The BPE pre-tokenizer was not recognized!")
-            print("** This means that it was not added yet or you are using an older version.")
-            print("** Check convert-hf-to-gguf-update.py and update it accordingly.")
-            print("**")
-            print(f"** chkhsh: {chkhsh}")
-            print("**************************************************************************************")
-            print("\n")
-            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
-
-        print(f"tokenizer.ggml.pre: {res}")
-        print(f"chkhsh: {chkhsh}")
-
-        return res
-
-    def _set_vocab_gpt2(self) -> None:
-        tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def _set_vocab_qwen(self):
@@ -2523,26 +2457,35 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_pooling_type(pooling_type)
 
     def set_vocab(self):
-        tokens, toktypes, tokpre = self.get_vocab_base()
-        self.vocab_size = len(tokens)
+        # use huggingface vocab to get all tokens
+        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
+        tokens, scores, toktypes = zip(*vocab.all_tokens())
+        assert len(tokens) == vocab.vocab_size
+        self.vocab_size = vocab.vocab_size
 
         # we need this to validate the size of the token_type embeddings
         # though currently we are passing all zeros to the token_type embeddings
-        self.gguf_writer.add_token_type_count(2)  # "Sequence A" or "Sequence B"
+        n_token_types = len(set(toktypes))
+        self.gguf_writer.add_token_type_count(n_token_types)
 
         # convert to phantom space vocab
-        def phantom(tok):
-            if tok.startswith("[") and tok.endswith("]"):
+        def phantom(tok, typ):
+            if tok.startswith(b"[") and tok.endswith(b"]"):
                 return tok
-            if tok.startswith("##"):
+            if tok.startswith(b"##"):
                 return tok[2:]
-            return "\u2581" + tok
-        tokens = list(map(phantom, tokens))
+            return b"\xe2\x96\x81" + tok
+        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
+
+        # set up bos and eos tokens (cls and sep)
+        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
+        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
 
         # add vocab to gguf
         self.gguf_writer.add_tokenizer_model("bert")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)
 
         # handle special tokens
@@ -2618,6 +2561,16 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
 
+    def get_tensors(self):
+        assert self.vocab_size is not None
+        for name, data in super().get_tensors():
+            # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
+            if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
+                rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
+                assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
+                data = data[:self.vocab_size, :]
+            yield name, data
+
 
 @Model.register("GemmaForCausalLM")
 class GemmaModel(Model):
@@ -2818,8 +2771,7 @@ def write_tensors(self):
             data = data.astype(np.float32)
 
             # if f16 desired, convert big float32 2-dim weight tensors to float16
-            new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
-            if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
+            if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
                 data = data.astype(np.float16)
 
             print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

convert-persimmon-to-gguf.py

Lines changed: 0 additions & 2 deletions
@@ -1,6 +1,4 @@
 #!/usr/bin/env python3
-from __future__ import annotations
-
 import argparse
 import os
 import sys

convert.py

Lines changed: 11 additions & 17 deletions
@@ -33,7 +33,7 @@
 import gguf
 
 if TYPE_CHECKING:
-    from typing_extensions import Self, TypeAlias
+    from typing import TypeAlias
 
 if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
     faulthandler.register(signal.SIGUSR1)
@@ -517,22 +517,17 @@ class LlamaHfVocab(Vocab):
     tokenizer_model = "llama"
     name = "hfft"
 
-    def __init__(self, base_path: Path):
+    def __init__(self, base_path: Path, ignore_nonllama: bool = False):
         fname_tokenizer = base_path / FAST_TOKENIZER_FILE
         # if this fails, FileNotFoundError propagates to caller
         with open(fname_tokenizer, encoding='utf-8') as f:
             tokenizer_json = json.load(f)
 
         # pre-check so we know if we need transformers
         tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        is_llama3 = (
-            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
-            and not tokenizer_model.get('byte_fallback', True)
-        )
-        if is_llama3:
-            raise TypeError('Llama 3 must be converted with BpeVocab')
-
-        if not is_llama3 and (
+        if ignore_nonllama:
+            pass  # workaround incorrect use of this class for WordPiece
+        elif (
             tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
             or tokenizer_json['decoder']['type'] != 'Sequence'
         ):
@@ -652,17 +647,16 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
 
 
 class Tensor(ABC):
-    ndarray: NDArray
     data_type: DataType
 
     @abstractmethod
-    def astype(self, data_type: DataType) -> Self: ...
+    def astype(self, data_type: DataType) -> Tensor: ...
     @abstractmethod
-    def permute(self, n_head: int, n_head_kv: int) -> Self: ...
+    def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
     @abstractmethod
-    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
+    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
     @abstractmethod
-    def part(self, n_part: int) -> Self: ...
+    def part(self, n_part: int) -> UnquantizedTensor: ...
     @abstractmethod
     def to_ggml(self) -> GGMLCompatibleTensor: ...

@@ -679,13 +673,13 @@ def __init__(self, ndarray: NDArray):
         self.ndarray = ndarray
         self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
 
-    def astype(self, data_type: DataType) -> UnquantizedTensor:
+    def astype(self, data_type: DataType) -> Tensor:
         dtype = data_type.dtype
         if self.data_type == DT_BF16:
             self.ndarray = bf16_to_fp32(self.ndarray)
         return UnquantizedTensor(self.ndarray.astype(dtype))
 
-    def to_ggml(self) -> Self:
+    def to_ggml(self) -> UnquantizedTensor:
         return self
 
     def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:

examples/embedding/embedding.cpp

Lines changed: 3 additions & 3 deletions
@@ -125,10 +125,10 @@ int main(int argc, char ** argv) {
         inputs.push_back(inp);
     }
 
-    // add SEP if not present
+    // add eos if not present
    for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_token_sep(model)) {
-            inp.push_back(llama_token_sep(model));
+        if (inp.empty() || inp.back() != llama_token_eos(model)) {
+            inp.push_back(llama_token_eos(model));
         }
     }
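
Note: for BERT-style models converted with the convert-hf-to-gguf.py change above, the GGUF EOS id is written from the tokenizer's SEP id (add_eos_token_id(vocab.tokenizer.sep_token_id)), so appending llama_token_eos(model) here should still terminate each embedding input with that model's [SEP] token.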

examples/imatrix/imatrix.cpp

Lines changed: 1 addition & 2 deletions
@@ -372,13 +372,12 @@ static void process_logits(
 static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
 
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
     const int n_ctx = llama_n_ctx(ctx);
 
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

examples/infill/infill.cpp

Lines changed: 2 additions & 3 deletions
@@ -239,7 +239,6 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s\n", get_system_info(params).c_str());
     }
     const bool add_bos = llama_should_add_bos_token(model);
-    GGML_ASSERT(llama_add_eos_token(model) != 1);
     LOG("add_bos: %d\n", add_bos);
 
     bool suff_rm_leading_spc = params.escape;
@@ -280,10 +279,10 @@
     if (ctx_guidance) {
         LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
 
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
 
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
         LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
 
         original_prompt_len = original_inp.size();

examples/llava/llava-cli.cpp

Lines changed: 2 additions & 1 deletion
@@ -147,6 +147,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     int n_past = 0;
 
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
 
     std::string system_prompt, user_prompt;
     size_t image_pos = prompt.find("<image>");
@@ -180,7 +181,7 @@
         }
     }
 
-    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
     llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
     eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);

examples/lookahead/lookahead.cpp

Lines changed: 4 additions & 1 deletion
@@ -64,10 +64,13 @@ int main(int argc, char ** argv) {
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
 
     // Tokenize the prompt
+    const bool add_bos = llama_should_add_bos_token(model);
+    LOG("add_bos tgt: %d\n", add_bos);
+
     std::vector<llama_token> inp;
     std::vector<llama_token> all;
 
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
     all = inp;
 
     const int max_context_size = llama_n_ctx(ctx);

examples/lookup/lookup-create.cpp

Lines changed: 3 additions & 1 deletion
@@ -28,8 +28,10 @@ int main(int argc, char ** argv){
     GGML_ASSERT(model != nullptr);
 
     // tokenize the prompt
+    const bool add_bos = llama_should_add_bos_token(model);
+
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
     fprintf(stderr, "%s: tokenization done\n", __func__);