teleprint-me
diff --git a/‎common/common.cpp
Lines changed: 5 additions & 0 deletions b/‎common/common.cpp
Lines changed: 5 additions & 0 deletions
diff --git a/‎convert_hf_to_gguf.py
Lines changed: 0 additions & 5 deletions b/‎convert_hf_to_gguf.py
Lines changed: 0 additions & 5 deletions
diff --git a/‎docs/development/HOWTO-add-model.md
Lines changed: 8 additions & 8 deletions b/‎docs/development/HOWTO-add-model.md
Lines changed: 8 additions & 8 deletions
diff --git a/‎docs/development/token_generation_performance_tips.md
Lines changed: 1 addition & 1 deletion b/‎docs/development/token_generation_performance_tips.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/pydantic_models_to_grammar.py
Lines changed: 43 additions & 34 deletions b/‎examples/pydantic_models_to_grammar.py
Lines changed: 43 additions & 34 deletions
diff --git a/‎examples/pydantic_models_to_grammar_examples.py
Lines changed: 2 additions & 0 deletions b/‎examples/pydantic_models_to_grammar_examples.py
Lines changed: 2 additions & 0 deletions
@@ -797,6 +797,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.cont_batching = true;
         return true;
     }
+    if (arg == "-nocb" || arg == "--no-cont-batching") {
+        params.cont_batching = false;
+        return true;
+    }
     if (arg == "-fa" || arg == "--flash-attn") {
         params.flash_attn = true;
         return true;
@@ -1538,6 +1542,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*",           "-np,   --parallel N",           "number of parallel sequences to decode (default: %d)", params.n_parallel });
     options.push_back({ "*",           "-ns,   --sequences N",          "number of sequences to decode (default: %d)", params.n_sequences });
     options.push_back({ "*",           "-cb,   --cont-batching",        "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
+    options.push_back({ "*",           "-nocb, --no-cont-batching",     "disable continuous batching" });
 
     options.push_back({ "multi-modality" });
     options.push_back({ "*",           "       --mmproj FILE",          "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
 
@@ -2504,11 +2504,6 @@ def set_gguf_parameters(self):
         )
         self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
 
-        # sanity check
-        attn_scalar = self.hparams["query_pre_attn_scalar"]
-        if attn_scalar != hparams["hidden_size"] / hparams["num_attention_heads"]:
-            raise ValueError("query_pre_attn_scalar must be equal to n_embd / n_head")
-
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
 
 
@@ -9,15 +9,15 @@ Adding a model requires few steps:
 After following these steps, you can open PR.
 
 Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
-- [main](../examples/main)
-- [imatrix](../examples/imatrix)
-- [quantize](../examples/quantize)
-- [server](../examples/server)
+- [main](/examples/main/)
+- [imatrix](/examples/imatrix/)
+- [quantize](/examples/quantize/)
+- [server](/examples/server/)
 
 ### 1. Convert the model to GGUF
 
 This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
-Depending on the model architecture, you can use either [convert_hf_to_gguf.py](../convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](../examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
+Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
 
 The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
 
@@ -31,7 +31,7 @@ class MyModel(Model):
     model_arch = gguf.MODEL_ARCH.GROK
 ```
 
-2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)
+2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py)
 
 Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.
 
@@ -54,7 +54,7 @@ Example for `falcon` model:
 
 As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.
 
-Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.
+Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](/gguf-py/gguf/tensor_mapping.py) file.
 
 If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it.
 
@@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil
 
 When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
 
-Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).
+Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/).
 
 ## GGUF specification
 
 
@@ -1,7 +1,7 @@
 # Token generation performance troubleshooting
 
 ## Verifying that the model is running on the GPU with CUDA
-Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
+Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
 ./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```
 
@@ -6,7 +6,7 @@
 from copy import copy
 from enum import Enum
 from inspect import getdoc, isclass
-from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin
+from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
 
 from docstring_parser import parse
 from pydantic import BaseModel, create_model
@@ -53,35 +53,38 @@ class PydanticDataType(Enum):
 
 
 def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str:
-    if isclass(pydantic_type) and issubclass(pydantic_type, str):
+    origin_type = get_origin(pydantic_type)
+    origin_type = pydantic_type if origin_type is None else origin_type
+
+    if isclass(origin_type) and issubclass(origin_type, str):
         return PydanticDataType.STRING.value
-    elif isclass(pydantic_type) and issubclass(pydantic_type, bool):
+    elif isclass(origin_type) and issubclass(origin_type, bool):
         return PydanticDataType.BOOLEAN.value
-    elif isclass(pydantic_type) and issubclass(pydantic_type, int):
+    elif isclass(origin_type) and issubclass(origin_type, int):
         return PydanticDataType.INTEGER.value
-    elif isclass(pydantic_type) and issubclass(pydantic_type, float):
+    elif isclass(origin_type) and issubclass(origin_type, float):
         return PydanticDataType.FLOAT.value
-    elif isclass(pydantic_type) and issubclass(pydantic_type, Enum):
+    elif isclass(origin_type) and issubclass(origin_type, Enum):
         return PydanticDataType.ENUM.value
 
-    elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel):
-        return format_model_and_field_name(pydantic_type.__name__)
-    elif get_origin(pydantic_type) is list:
+    elif isclass(origin_type) and issubclass(origin_type, BaseModel):
+        return format_model_and_field_name(origin_type.__name__)
+    elif origin_type is list:
         element_type = get_args(pydantic_type)[0]
         return f"{map_pydantic_type_to_gbnf(element_type)}-list"
-    elif get_origin(pydantic_type) is set:
+    elif origin_type is set:
         element_type = get_args(pydantic_type)[0]
         return f"{map_pydantic_type_to_gbnf(element_type)}-set"
-    elif get_origin(pydantic_type) is Union:
+    elif origin_type is Union:
         union_types = get_args(pydantic_type)
         union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types]
         return f"union-{'-or-'.join(union_rules)}"
-    elif get_origin(pydantic_type) is Optional:
+    elif origin_type is Optional:
         element_type = get_args(pydantic_type)[0]
         return f"optional-{map_pydantic_type_to_gbnf(element_type)}"
-    elif isclass(pydantic_type):
-        return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}"
-    elif get_origin(pydantic_type) is dict:
+    elif isclass(origin_type):
+        return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(origin_type.__name__)}"
+    elif origin_type is dict:
         key_type, value_type = get_args(pydantic_type)
         return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}"
     else:
@@ -118,7 +121,7 @@ def get_members_structure(cls, rule_name):
         # Modify this comprehension
         members = [
             f'  "\\"{name}\\"" ":"  {map_pydantic_type_to_gbnf(param_type)}'
-            for name, param_type in cls.__annotations__.items()
+            for name, param_type in get_type_hints(cls).items()
             if name != "self"
         ]
 
@@ -297,17 +300,20 @@ def generate_gbnf_rule_for_type(
     field_name = format_model_and_field_name(field_name)
     gbnf_type = map_pydantic_type_to_gbnf(field_type)
 
-    if isclass(field_type) and issubclass(field_type, BaseModel):
+    origin_type = get_origin(field_type)
+    origin_type = field_type if origin_type is None else origin_type
+
+    if isclass(origin_type) and issubclass(origin_type, BaseModel):
         nested_model_name = format_model_and_field_name(field_type.__name__)
         nested_model_rules, _ = generate_gbnf_grammar(field_type, processed_models, created_rules)
         rules.extend(nested_model_rules)
         gbnf_type, rules = nested_model_name, rules
-    elif isclass(field_type) and issubclass(field_type, Enum):
+    elif isclass(origin_type) and issubclass(origin_type, Enum):
         enum_values = [f'"\\"{e.value}\\""' for e in field_type]  # Adding escaped quotes
         enum_rule = f"{model_name}-{field_name} ::= {' | '.join(enum_values)}"
         rules.append(enum_rule)
         gbnf_type, rules = model_name + "-" + field_name, rules
-    elif get_origin(field_type) == list:  # Array
+    elif origin_type is list:  # Array
         element_type = get_args(field_type)[0]
         element_rule_name, additional_rules = generate_gbnf_rule_for_type(
             model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules
@@ -317,7 +323,7 @@ def generate_gbnf_rule_for_type(
         rules.append(array_rule)
         gbnf_type, rules = model_name + "-" + field_name, rules
 
-    elif get_origin(field_type) == set or field_type == set:  # Array
+    elif origin_type is set:  # Array
         element_type = get_args(field_type)[0]
         element_rule_name, additional_rules = generate_gbnf_rule_for_type(
             model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules
@@ -371,7 +377,7 @@ def generate_gbnf_rule_for_type(
             gbnf_type = f"{model_name}-{field_name}-optional"
         else:
             gbnf_type = f"{model_name}-{field_name}-union"
-    elif isclass(field_type) and issubclass(field_type, str):
+    elif isclass(origin_type) and issubclass(origin_type, str):
         if field_info and hasattr(field_info, "json_schema_extra") and field_info.json_schema_extra is not None:
             triple_quoted_string = field_info.json_schema_extra.get("triple_quoted_string", False)
             markdown_string = field_info.json_schema_extra.get("markdown_code_block", False)
@@ -387,8 +393,8 @@ def generate_gbnf_rule_for_type(
             gbnf_type = PydanticDataType.STRING.value
 
     elif (
-        isclass(field_type)
-        and issubclass(field_type, float)
+        isclass(origin_type)
+        and issubclass(origin_type, float)
         and field_info
         and hasattr(field_info, "json_schema_extra")
         and field_info.json_schema_extra is not None
@@ -413,8 +419,8 @@ def generate_gbnf_rule_for_type(
         )
 
     elif (
-        isclass(field_type)
-        and issubclass(field_type, int)
+        isclass(origin_type)
+        and issubclass(origin_type, int)
         and field_info
         and hasattr(field_info, "json_schema_extra")
         and field_info.json_schema_extra is not None
@@ -462,15 +468,15 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas
     if not issubclass(model, BaseModel):
         # For non-Pydantic classes, generate model_fields from __annotations__ or __init__
         if hasattr(model, "__annotations__") and model.__annotations__:
-            model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()}  # pyright: ignore[reportGeneralTypeIssues]
+            model_fields = {name: (typ, ...) for name, typ in get_type_hints(model).items()}
         else:
             init_signature = inspect.signature(model.__init__)
             parameters = init_signature.parameters
             model_fields = {name: (param.annotation, param.default) for name, param in parameters.items() if
                             name != "self"}
     else:
         # For Pydantic models, use model_fields and check for ellipsis (required fields)
-        model_fields = model.__annotations__
+        model_fields = get_type_hints(model)
 
     model_rule_parts = []
     nested_rules = []
@@ -706,7 +712,7 @@ def generate_markdown_documentation(
         else:
             documentation += f"  Fields:\n"  # noqa: F541
         if isclass(model) and issubclass(model, BaseModel):
-            for name, field_type in model.__annotations__.items():
+            for name, field_type in get_type_hints(model).items():
                 # if name == "markdown_code_block":
                 #    continue
                 if get_origin(field_type) == list:
@@ -754,14 +760,17 @@ def generate_field_markdown(
     field_info = model.model_fields.get(field_name)
     field_description = field_info.description if field_info and field_info.description else ""
 
-    if get_origin(field_type) == list:
+    origin_type = get_origin(field_type)
+    origin_type = field_type if origin_type is None else origin_type
+
+    if origin_type == list:
         element_type = get_args(field_type)[0]
         field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)} of {format_model_and_field_name(element_type.__name__)})"
         if field_description != "":
             field_text += ":\n"
         else:
             field_text += "\n"
-    elif get_origin(field_type) == Union:
+    elif origin_type == Union:
         element_types = get_args(field_type)
         types = []
         for element_type in element_types:
@@ -792,9 +801,9 @@ def generate_field_markdown(
             example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example
             field_text += f"{indent}  Example: {example_text}\n"
 
-    if isclass(field_type) and issubclass(field_type, BaseModel):
+    if isclass(origin_type) and issubclass(origin_type, BaseModel):
         field_text += f"{indent}  Details:\n"
-        for name, type_ in field_type.__annotations__.items():
+        for name, type_ in get_type_hints(field_type).items():
             field_text += generate_field_markdown(name, type_, field_type, depth + 2)
 
     return field_text
@@ -855,7 +864,7 @@ def generate_text_documentation(
 
         if isclass(model) and issubclass(model, BaseModel):
             documentation_fields = ""
-            for name, field_type in model.__annotations__.items():
+            for name, field_type in get_type_hints(model).items():
                 # if name == "markdown_code_block":
                 #    continue
                 if get_origin(field_type) == list:
@@ -948,7 +957,7 @@ def generate_field_text(
 
     if isclass(field_type) and issubclass(field_type, BaseModel):
         field_text += f"{indent}  Details:\n"
-        for name, type_ in field_type.__annotations__.items():
+        for name, type_ in get_type_hints(field_type).items():
             field_text += generate_field_text(name, type_, field_type, depth + 2)
 
     return field_text
 
@@ -20,6 +20,8 @@ def create_completion(prompt, grammar):
     response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data)
     data = response.json()
 
+    assert data.get("error") is None, data
+
     print(data["content"])
     return data["content"]