
Commit d92d2b5

Update docs
1 parent 231d9f8 commit d92d2b5


5 files changed: +63 -19 lines changed


docs/WEB_SEARCH.md

Lines changed: 35 additions & 0 deletions
@@ -30,6 +30,28 @@ By default, mistral.rs uses a DuckDuckGo-based search callback. To override this
 - Rust: use `.with_search_callback(...)` on the model builder with an `Arc<dyn Fn(&SearchFunctionParameters) -> anyhow::Result<Vec<SearchResult>> + Send + Sync>`.
 - Python: pass the `search_callback` keyword argument to `Runner`, which should be a function `def search_callback(query: str) -> List[Dict[str, str]]` returning a list of results with keys `"title"`, `"description"`, `"url"`, and `"content"`.
 
+Example in Python:
+```py
+def search_callback(query: str) -> list[dict[str, str]]:
+    # Implement your custom search logic here, returning a list of result dicts
+    return [
+        {
+            "title": "Example Result",
+            "description": "An example description",
+            "url": "https://example.com",
+            "content": "Full text content of the page",
+        },
+        # more results...
+    ]
+
+from mistralrs import Runner, Which, Architecture
+runner = Runner(
+    which=Which.Plain(model_id="YourModel/ID", arch=Architecture.Mistral),
+    enable_search=True,
+    search_callback=search_callback,
+)
+```
+
 ## HTTP server
 **Be sure to add `--enable-search`!**
 
@@ -87,12 +109,25 @@ from mistralrs import (
     WebSearchOptions,
 )
 
+# Define a custom search callback if desired
+def my_search_callback(query: str) -> list[dict[str, str]]:
+    # Fetch or compute search results here
+    return [
+        {
+            "title": "Mistral.rs GitHub",
+            "description": "Official mistral.rs repository",
+            "url": "https://github.com/huggingface/mistral.rs",
+            "content": "mistral.rs is a Rust binding for Mistral models...",
+        },
+    ]
+
 runner = Runner(
     which=Which.Plain(
         model_id="NousResearch/Hermes-3-Llama-3.1-8B",
         arch=Architecture.Llama,
     ),
     enable_search=True,
+    search_callback=my_search_callback,
 )
 
 res = runner.send_chat_completion_request(
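The stub callbacks above return fixed data. A more realistic callback would query an actual search backend and normalize its response into the four required keys; the following is a minimal sketch assuming the `requests` package and a placeholder JSON endpoint (the URL, query parameters, and response fields are assumptions, not part of mistral.rs):

```py
import requests

# Placeholder endpoint; substitute your own search API.
SEARCH_ENDPOINT = "https://search.example.com/api"

def http_search_callback(query: str) -> list[dict[str, str]]:
    # Query the (hypothetical) backend and map its JSON into the four
    # string keys mistral.rs expects: title, description, url, content.
    resp = requests.get(SEARCH_ENDPOINT, params={"q": query}, timeout=10)
    resp.raise_for_status()
    results = []
    for item in resp.json().get("results", []):
        results.append(
            {
                "title": item.get("title", ""),
                "description": item.get("snippet", ""),
                "url": item.get("url", ""),
                "content": item.get("text", ""),
            }
        )
    return results
```

Such a callback is passed exactly like the stubs above, via `search_callback=http_search_callback` on `Runner`.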

examples/python/local_search.py

Lines changed: 15 additions & 9 deletions
@@ -7,25 +7,29 @@
 )
 import os
 
+
 def local_search(query: str):
     results = []
-    for root, _, files in os.walk('.'):
+    for root, _, files in os.walk("."):
         for f in files:
             if query in f:
                 path = os.path.join(root, f)
                 try:
                     content = open(path).read()
                 except Exception:
                     content = ""
-                results.append({
-                    "title": f,
-                    "description": path,
-                    "url": path,
-                    "content": content,
-                })
-    results.sort(key=lambda r: r['title'], reverse=True)
+                results.append(
+                    {
+                        "title": f,
+                        "description": path,
+                        "url": path,
+                        "content": content,
+                    }
+                )
+    results.sort(key=lambda r: r["title"], reverse=True)
     return results
 
+
 runner = Runner(
     which=Which.Plain(
         model_id="NousResearch/Hermes-3-Llama-3.1-8B",
@@ -40,7 +44,9 @@ def local_search(query: str):
         model="mistral",
         messages=[{"role": "user", "content": "Where is Cargo.toml in this repo?"}],
         max_tokens=64,
-        web_search_options=WebSearchOptions(search_description="Local filesystem search"),
+        web_search_options=WebSearchOptions(
+            search_description="Local filesystem search"
+        ),
     )
 )
 print(res.choices[0].message.content)
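Because the callback is a plain Python function, it can be exercised without loading a model; a quick sanity check of `local_search` might look like this (a sketch, assuming it is run from the repository root):

```py
# Call the callback directly and inspect the result dicts it produces.
hits = local_search("Cargo.toml")
for hit in hits[:3]:
    print(hit["url"], "-", hit["title"])
```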

mistralrs-pyo3/mistralrs.pyi

Lines changed: 4 additions & 1 deletion
@@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from enum import Enum
-from typing import Iterator, Literal, Optional
+from typing import Iterator, Literal, Optional, Callable
 
 class SearchContextSize(Enum):
     Low = "low"
@@ -345,7 +345,9 @@ class Runner:
         paged_attn: bool = False,
         prompt_batchsize: int | None = None,
         seed: int | None = None,
+        enable_search: bool = False,
         search_bert_model: str | None = None,
+        search_callback: Callable[[str], list[dict[str, str]]] | None = None,
         no_bert_model: bool = False,
     ) -> None:
         """
@@ -389,6 +391,7 @@ class Runner:
         - `seed`, used to ensure reproducible random number generation.
         - `enable_search`: Enable searching compatible with the OpenAI `web_search_options` setting. This uses the BERT model specified below or the default.
         - `search_bert_model`: specify a Hugging Face model ID for a BERT model to assist web searching. Defaults to Snowflake Arctic Embed L.
+        - `search_callback`: Custom Python callable to perform web searches. Should accept a query string and return a list of dicts with keys "title", "description", "url", and "content".
         """
         ...
 
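With `Callable[[str], list[dict[str, str]]]` exposed in the stub, a search callback can be checked against the expected signature by a type checker before being passed to `Runner`; a minimal sketch (the alias name and the function body are illustrative only):

```py
from typing import Callable

# Alias matching the `search_callback` parameter in the stub above.
SearchCallback = Callable[[str], list[dict[str, str]]]

def echo_callback(query: str) -> list[dict[str, str]]:
    # Illustrative only: return the query itself as a single result.
    return [
        {
            "title": query,
            "description": "placeholder result",
            "url": "https://example.com",
            "content": "",
        }
    ]

# mypy/pyright will flag any signature mismatch here.
checked: SearchCallback = echo_callback
```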

mistralrs-quant/kernels/marlin/marlin_kernel.cu

Lines changed: 8 additions & 8 deletions
@@ -86,8 +86,8 @@ dequant<half, ScalarTypeID::kU4B8>(int q) {
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
-  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
   // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
   // directly into `SUB` and `ADD`.
   const int SUB = 0x64086408;
@@ -110,9 +110,9 @@ dequant<nv_bfloat16, ScalarTypeID::kU4B8>(int q) {
 
   // Guarantee that the `(a & b) | c` operations are LOP3s.
 
-  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
   q >>= 4;
-  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
 
   typename ScalarType<nv_bfloat16>::FragB frag_b;
   static constexpr uint32_t MUL = 0x3F803F80;
@@ -135,8 +135,8 @@ dequant<half, ScalarTypeID::kU4>(int q) {
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
-  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
 
   const int SUB = 0x64006400;
   const int MUL = 0x2c002c00;
@@ -158,9 +158,9 @@ dequant<nv_bfloat16, ScalarTypeID::kU4>(int q) {
 
   // Guarantee that the `(a & b) | c` operations are LOP3s.
 
-  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
   q >>= 4;
-  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
 
   typename ScalarType<nv_bfloat16>::FragB frag_b;
   static constexpr uint32_t MUL = 0x3F803F80;
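The reflowed calls all pass the same template argument: `(0xf0 & 0xcc) | 0xaa` is the `lop3.b32` lookup-table immediate for the boolean function `(a & b) | c`, built from the usual operand masks a→0xF0, b→0xCC, c→0xAA. A quick check of that identity (illustrative Python, not part of the commit):

```py
# lop3 LUT convention: each operand contributes a fixed 8-bit mask.
A, B, C = 0xF0, 0xCC, 0xAA

lut = (A & B) | C  # 0xEA, the immediate used by the kernel
assert lut == 0xEA

# Bit i of the LUT is f(a, b, c), where (a, b, c) are the bits of i.
for i in range(8):
    a, b, c = (i >> 2) & 1, (i >> 1) & 1, i & 1
    assert ((lut >> i) & 1) == ((a & b) | c)
print(f"LUT 0x{lut:02X} encodes (a & b) | c")
```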

scripts/generate_uqff_card.py

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@
 )
 file = input("Enter UQFF filename (with extension): ").strip()
 if ";" in file:
-    file = f"\"{file}\""
+    file = f'"{file}"'
 
 quants = input(
     "Enter quantization NAMES used to make that file (single quantization name, OR if multiple, comma delimited): "
