
Commit 82e6483

Revert "BERT tokenizer fixes (ggml-org#6498)"
This reverts commit 1b67731.
1 parent: b9778d3

File tree: 20 files changed, +174 -264 lines

common/common.cpp

Lines changed: 8 additions & 8 deletions
@@ -2332,23 +2332,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 std::vector<llama_token> llama_tokenize(
   const struct llama_context * ctx,
         const std::string & text,
-        bool add_special,
-        bool parse_special) {
-    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+        bool add_bos,
+        bool special) {
+    return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
 }
 
 std::vector<llama_token> llama_tokenize(
   const struct llama_model * model,
         const std::string & text,
-        bool add_special,
-        bool parse_special) {
+        bool add_bos,
+        bool special) {
     // upper limit for the number of tokens
-    int n_tokens = text.length() + 2 * add_special;
+    int n_tokens = text.length() + add_bos;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);

common/common.h

Lines changed: 4 additions & 4 deletions
@@ -232,14 +232,14 @@ void llama_batch_add(
 std::vector<llama_token> llama_tokenize(
   const struct llama_context * ctx,
         const std::string & text,
-        bool add_special,
-        bool parse_special = false);
+        bool add_bos,
+        bool special = false);
 
 std::vector<llama_token> llama_tokenize(
   const struct llama_model * model,
         const std::string & text,
-        bool add_special,
-        bool parse_special = false);
+        bool add_bos,
+        bool special = false);
 
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
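
For context, a minimal sketch of how callers use the reverted two-flag helper; the wrapper below is illustrative only and not part of this commit. The name tokenize_prompt is hypothetical, while llama_tokenize, llama_get_model, and llama_should_add_bos_token are the functions that appear in the diffs of this commit.

// Hypothetical wrapper (not from this commit) showing the reverted call pattern.
#include <string>
#include <vector>

#include "common.h" // declares the two-flag llama_tokenize helpers shown above

static std::vector<llama_token> tokenize_prompt(llama_context * ctx, const std::string & prompt) {
    // after the revert, each caller decides whether a BOS token is prepended
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));

    // the second flag controls whether special tokens in the text are parsed
    return ::llama_tokenize(ctx, prompt, add_bos, /*special =*/ true);
}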

convert-hf-to-gguf.py

Lines changed: 34 additions & 82 deletions
@@ -229,14 +229,15 @@ def _get_part_names(self):
             return ("pytorch_model.bin",)
         return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
 
-    # used for GPT-2 BPE and WordPiece vocabs
-    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
+    def _set_vocab_gpt2(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
         tokens: list[str] = []
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
+        tokenizer = AutoTokenizer.from_pretrained(dir_model)
+        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size
 
         tokpre = self.get_vocab_base_pre(tokenizer)
@@ -258,79 +259,12 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
             tokens.append(reverse_vocab[i])
             toktypes.append(gguf.TokenType.NORMAL)
 
-        return tokens, toktypes, tokpre
-
-    # NOTE: this function is generated by convert-hf-to-gguf-update.py
-    # do not modify it manually!
-    # ref: https://github.com/ggerganov/llama.cpp/pull/6920
-    def get_vocab_base_pre(self, tokenizer) -> str:
-        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
-        # is specific for the BPE pre-tokenizer used by the model
-        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
-        # use in llama.cpp to implement the same pre-tokenizer
-
-        chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
-
-        chktok = tokenizer.encode(chktxt)
-        chkhsh = sha256(str(chktok).encode()).hexdigest()
-
-        print(f"chktok: {chktok}")
-        print(f"chkhsh: {chkhsh}")
-
-        res = None
-
-        # NOTE: if you get an error here, you need to add the model to the if-elif chain below
-        # don't do this manually - use the convert-hf-to-gguf-update.py script!
-        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
-            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-            res = "llama-bpe"
-        if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
-            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
-            res = "deepseek-llm"
-        if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
-            # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
-            res = "deepseek-coder"
-        if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
-            # ref: https://huggingface.co/tiiuae/falcon-7b
-            res = "falcon"
-        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
-            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
-            res = "bert-bge"
-        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
-            # ref: https://huggingface.co/mosaicml/mpt-7b
-            res = "mpt"
-        if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34":
-            # ref: https://huggingface.co/bigcode/starcoder2-3b
-            res = "starcoder"
-        if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
-            # ref: https://huggingface.co/openai-community/gpt2
-            res = "gpt-2"
-
-        if res is None:
-            print("\n")
-            print("**************************************************************************************")
-            print("** WARNING: The BPE pre-tokenizer was not recognized!")
-            print("** This means that it was not added yet or you are using an older version.")
-            print("** Check convert-hf-to-gguf-update.py and update it accordingly.")
-            print("**")
-            print(f"** chkhsh: {chkhsh}")
-            print("**************************************************************************************")
-            print("\n")
-            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
-
-        print(f"tokenizer.ggml.pre: {res}")
-        print(f"chkhsh: {chkhsh}")
-
-        return res
-
-    def _set_vocab_gpt2(self) -> None:
-        tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def _set_vocab_qwen(self):
@@ -2523,26 +2457,35 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_pooling_type(pooling_type)
 
     def set_vocab(self):
-        tokens, toktypes, tokpre = self.get_vocab_base()
-        self.vocab_size = len(tokens)
+        # use huggingface vocab to get all tokens
+        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
+        tokens, scores, toktypes = zip(*vocab.all_tokens())
+        assert len(tokens) == vocab.vocab_size
+        self.vocab_size = vocab.vocab_size
 
         # we need this to validate the size of the token_type embeddings
         # though currently we are passing all zeros to the token_type embeddings
-        self.gguf_writer.add_token_type_count(2)  # "Sequence A" or "Sequence B"
+        n_token_types = len(set(toktypes))
+        self.gguf_writer.add_token_type_count(n_token_types)
 
         # convert to phantom space vocab
-        def phantom(tok):
-            if tok.startswith("[") and tok.endswith("]"):
+        def phantom(tok, typ):
+            if tok.startswith(b"[") and tok.endswith(b"]"):
                 return tok
-            if tok.startswith("##"):
+            if tok.startswith(b"##"):
                 return tok[2:]
-            return "\u2581" + tok
-        tokens = list(map(phantom, tokens))
+            return b"\xe2\x96\x81" + tok
+        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
+
+        # set up bos and eos tokens (cls and sep)
+        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
+        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
 
         # add vocab to gguf
         self.gguf_writer.add_tokenizer_model("bert")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)
 
         # handle special tokens
@@ -2618,6 +2561,16 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
 
+    def get_tensors(self):
+        assert self.vocab_size is not None
+        for name, data in super().get_tensors():
+            # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
+            if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
+                rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
+                assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
+                data = data[:self.vocab_size, :]
+            yield name, data
+
 
 @Model.register("GemmaForCausalLM")
 class GemmaModel(Model):
@@ -2818,8 +2771,7 @@ def write_tensors(self):
             data = data.astype(np.float32)
 
             # if f16 desired, convert big float32 2-dim weight tensors to float16
-            new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
-            if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
+            if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
                 data = data.astype(np.float16)
 
             print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

convert-persimmon-to-gguf.py

Lines changed: 0 additions & 2 deletions
@@ -1,6 +1,4 @@
 #!/usr/bin/env python3
-from __future__ import annotations
-
 import argparse
 import os
 import sys

convert.py

Lines changed: 11 additions & 17 deletions
@@ -33,7 +33,7 @@
 import gguf
 
 if TYPE_CHECKING:
-    from typing_extensions import Self, TypeAlias
+    from typing import TypeAlias
 
 if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
     faulthandler.register(signal.SIGUSR1)
@@ -517,22 +517,17 @@ class LlamaHfVocab(Vocab):
     tokenizer_model = "llama"
     name = "hfft"
 
-    def __init__(self, base_path: Path):
+    def __init__(self, base_path: Path, ignore_nonllama: bool = False):
         fname_tokenizer = base_path / FAST_TOKENIZER_FILE
         # if this fails, FileNotFoundError propagates to caller
         with open(fname_tokenizer, encoding='utf-8') as f:
             tokenizer_json = json.load(f)
 
         # pre-check so we know if we need transformers
         tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        is_llama3 = (
-            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
-            and not tokenizer_model.get('byte_fallback', True)
-        )
-        if is_llama3:
-            raise TypeError('Llama 3 must be converted with BpeVocab')
-
-        if not is_llama3 and (
+        if ignore_nonllama:
+            pass  # workaround incorrect use of this class for WordPiece
+        elif (
             tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
             or tokenizer_json['decoder']['type'] != 'Sequence'
         ):
@@ -652,17 +647,16 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
 
 
 class Tensor(ABC):
-    ndarray: NDArray
     data_type: DataType
 
     @abstractmethod
-    def astype(self, data_type: DataType) -> Self: ...
+    def astype(self, data_type: DataType) -> Tensor: ...
     @abstractmethod
-    def permute(self, n_head: int, n_head_kv: int) -> Self: ...
+    def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
     @abstractmethod
-    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
+    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
     @abstractmethod
-    def part(self, n_part: int) -> Self: ...
+    def part(self, n_part: int) -> UnquantizedTensor: ...
     @abstractmethod
     def to_ggml(self) -> GGMLCompatibleTensor: ...

@@ -679,13 +673,13 @@ def __init__(self, ndarray: NDArray):
         self.ndarray = ndarray
         self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
 
-    def astype(self, data_type: DataType) -> UnquantizedTensor:
+    def astype(self, data_type: DataType) -> Tensor:
         dtype = data_type.dtype
         if self.data_type == DT_BF16:
             self.ndarray = bf16_to_fp32(self.ndarray)
         return UnquantizedTensor(self.ndarray.astype(dtype))
 
-    def to_ggml(self) -> Self:
+    def to_ggml(self) -> UnquantizedTensor:
         return self
 
     def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:

examples/embedding/embedding.cpp

Lines changed: 3 additions & 3 deletions
@@ -125,10 +125,10 @@ int main(int argc, char ** argv) {
         inputs.push_back(inp);
     }
 
-    // add SEP if not present
+    // add eos if not present
    for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_token_sep(model)) {
-            inp.push_back(llama_token_sep(model));
+        if (inp.empty() || inp.back() != llama_token_eos(model)) {
+            inp.push_back(llama_token_eos(model));
         }
     }
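
Note: for BERT-style models converted with the convert-hf-to-gguf.py change above, the GGUF EOS id is written from the tokenizer's SEP id (add_eos_token_id(vocab.tokenizer.sep_token_id)), so appending llama_token_eos(model) here should still terminate each embedding input with that model's [SEP] token.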

examples/imatrix/imatrix.cpp

Lines changed: 1 addition & 2 deletions
@@ -372,13 +372,12 @@ static void process_logits(
 static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
 
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
     const int n_ctx = llama_n_ctx(ctx);
 
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

examples/infill/infill.cpp

Lines changed: 2 additions & 3 deletions
@@ -239,7 +239,6 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s\n", get_system_info(params).c_str());
     }
     const bool add_bos = llama_should_add_bos_token(model);
-    GGML_ASSERT(llama_add_eos_token(model) != 1);
     LOG("add_bos: %d\n", add_bos);
 
     bool suff_rm_leading_spc = params.escape;
@@ -280,10 +279,10 @@
     if (ctx_guidance) {
         LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
 
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
 
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
         LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
 
         original_prompt_len = original_inp.size();

examples/llava/llava-cli.cpp

Lines changed: 2 additions & 1 deletion
@@ -147,6 +147,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     int n_past = 0;
 
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
 
     std::string system_prompt, user_prompt;
     size_t image_pos = prompt.find("<image>");
@@ -180,7 +181,7 @@
         }
     }
 
-    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
     llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
     eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);

examples/lookahead/lookahead.cpp

Lines changed: 4 additions & 1 deletion
@@ -64,10 +64,13 @@ int main(int argc, char ** argv) {
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
 
     // Tokenize the prompt
+    const bool add_bos = llama_should_add_bos_token(model);
+    LOG("add_bos tgt: %d\n", add_bos);
+
     std::vector<llama_token> inp;
     std::vector<llama_token> all;
 
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
     all = inp;
 
     const int max_context_size = llama_n_ctx(ctx);

examples/lookup/lookup-create.cpp

Lines changed: 3 additions & 1 deletion
@@ -28,8 +28,10 @@ int main(int argc, char ** argv){
     GGML_ASSERT(model != nullptr);
 
     // tokenize the prompt
+    const bool add_bos = llama_should_add_bos_token(model);
+
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
     fprintf(stderr, "%s: tokenization done\n", __func__);