EricLBuehler
diff --git a/‎.typos.toml
Lines changed: 4 additions & 1 deletion b/‎.typos.toml
Lines changed: 4 additions & 1 deletion
diff --git a/‎README.md
Lines changed: 6 additions & 0 deletions b/‎README.md
Lines changed: 6 additions & 0 deletions
diff --git a/‎docs/ISQ.md
Lines changed: 7 additions & 0 deletions b/‎docs/ISQ.md
Lines changed: 7 additions & 0 deletions
diff --git a/‎docs/UQFF.md
Lines changed: 7 additions & 0 deletions b/‎docs/UQFF.md
Lines changed: 7 additions & 0 deletions
diff --git a/‎mistralrs-core/src/pipeline/isq.rs
Lines changed: 29 additions & 17 deletions b/‎mistralrs-core/src/pipeline/isq.rs
Lines changed: 29 additions & 17 deletions
diff --git a/‎mistralrs-core/src/pipeline/normal.rs
Lines changed: 5 additions & 1 deletion b/‎mistralrs-core/src/pipeline/normal.rs
Lines changed: 5 additions & 1 deletion
diff --git a/‎mistralrs-core/src/pipeline/vision.rs
Lines changed: 5 additions & 1 deletion b/‎mistralrs-core/src/pipeline/vision.rs
Lines changed: 5 additions & 1 deletion
@@ -6,7 +6,10 @@ extend-ignore-identifiers-re = [
     "Nd",
     "nin",
     "cudaDevAttrMaxSharedMemoryPerBlockOptin",
-    "_thw"
+    "_thw",
+    "thr",
+    "nd",
+    "uneeded"
 ]
 
 [files]
 
@@ -31,6 +31,12 @@ Please submit requests for new models [here](https://github.com/EricLBuehler/mis
 - Check out UQFF for prequantized models of various methods!
     - Models can be found [here](https://huggingface.co/collections/EricB/uqff-670e4a49d56ecdd3f7f0fd4c).
 
+- 🔥 Try out AFQ for blazingly fast Metal performance!
+
+    ```
+    ./mistralrs-server -i --isq afq8 plain -m meta-llama/Llama-3.2-3B-Instruct
+    ```
+
 - 🔍🌐 Easily add web search capabilities to your models! Compatible with OpenAI's `web_search_options` parameter: [documentation](docs/WEB_SEARCH.md)
 
     ```
 
@@ -6,7 +6,14 @@ An API is exposed on the Python and Rust APIs which provide the ability to dynam
 
 To set the ISQ type for individual layers, use a model [`topology`](TOPOLOGY.md).
 
+> Note: 🔥 AFQ (affine) quantization is fast on **Metal**.
+
 ## ISQ quantization types
+- AFQ2
+- AFQ3
+- AFQ4
+- AFQ6
+- AFQ8
 - Q4_0
 - Q4_1
 - Q5_0
 
@@ -54,6 +54,13 @@ The following quantization formats are supported in UQFF. One can, of course, be
 - FP8:
     - FP8 E4M3 (4-bit exponent, 3-bit mantissa)
 
+- AFQ quantized (🔥 AFQ is fast on **Metal**):
+    - AFQ2
+    - AFQ3
+    - AFQ4
+    - AFQ6
+    - AFQ8
+
 ## Loading a UQFF model
 
 To load a UQFF model, one should specify the filename. This will be located based on the model ID, and can
 
@@ -14,9 +14,9 @@ use candle_core::{quantized, Context, Device, Tensor};
 use indicatif::{MultiProgress, ParallelProgressIterator, ProgressBar, ProgressStyle};
 use itertools::Itertools;
 use mistralrs_quant::{
-    CollectedImatrixData, ColumnParallelLayer, DistributedKind, FP8Linear, GgufMatMul, HqqLayer,
-    IsqType, QuantMethod, QuantizeOntoGuard, QuantizedSerde, QuantizedSerdeType, ReplicatedLayer,
-    RowParallelLayer, UnquantLinear,
+    AfqLayer, CollectedImatrixData, ColumnParallelLayer, DistributedKind, FP8Linear, GgufMatMul,
+    HqqLayer, IsqType, QuantMethod, QuantizeOntoGuard, QuantizedSerde, QuantizedSerdeType,
+    ReplicatedLayer, RowParallelLayer, UnquantLinear,
 };
 use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
 use regex::Regex;
@@ -63,10 +63,15 @@ pub fn parse_isq_value(s: &str) -> Result<IsqType, String> {
         "hqq8" => IsqType::HQQ8,
         "hqq4" => IsqType::HQQ4,
         "fp8" => IsqType::F8E4M3,
+        "afq8" => IsqType::AFQ8,
+        "afq6" => IsqType::AFQ6,
+        "afq4" => IsqType::AFQ4,
+        "afq3" => IsqType::AFQ3,
+        "afq2" => IsqType::AFQ2,
         // "hqq3" => IsqType::HQQ3,
         // "hqq2" => IsqType::HQQ2,
         // "hqq1" => IsqType::HQQ1,
-        _ => return Err(format!("ISQ type {s} unknown, choose one of `Q4_0`, `Q4_1`, `Q5_0`, `Q5_1`, `Q8_0`, `Q8_1`, `Q2K`, `Q3K`, `Q4K`, `Q5K`, `Q6K`, `Q8K`, `HQQ8`, `HQQ4`, `FP8`.")),
+        _ => return Err(format!("ISQ type {s} unknown, choose one of `Q4_0`, `Q4_1`, `Q5_0`, `Q5_1`, `Q8_0`, `Q8_1`, `Q2K`, `Q3K`, `Q4K`, `Q5K`, `Q6K`, `Q8K`, `HQQ8`, `HQQ4`, `FP8`, `AFQ8`, `AFQ6`, `AFQ4`, `AFQ3`, `AFQ2`.")),
     };
     #[cfg(feature = "cuda")]
     {
@@ -442,19 +447,14 @@ pub trait IsqModel {
             // Get the MINIMUM of the max ISQ threads allowed by the quant method
             let mut minimum_max_threads = {
                 let current_rayon_threads = rayon::current_num_threads();
-                tensors
-                    .iter()
-                    .map(|(q, _)| {
-                        if let Some(dtype) = dtype {
-                            q.get_max_isq_cpu_threads(dtype)
-                                .map(usize::from)
-                                .unwrap_or(current_rayon_threads)
-                        } else {
-                            current_rayon_threads
-                        }
-                    })
-                    .min()
-                    .unwrap_or(current_rayon_threads)
+                if let Some(dtype) = dtype {
+                    dtype
+                        .get_max_isq_cpu_threads()
+                        .map(usize::from)
+                        .unwrap_or(current_rayon_threads)
+                } else {
+                    current_rayon_threads
+                }
             };
             if env::var("MISTRALRS_ISQ_SINGLETHREAD").is_ok() {
                 minimum_max_threads = 1;
@@ -807,6 +807,12 @@ pub trait IsqModel {
                                         &comm,
                                         guard.clone(),
                                     )?,
+                                    QuantizedSerdeType::Afq => AfqLayer::deserialize(
+                                        Cow::from(artifact),
+                                        &devices[i],
+                                        &comm,
+                                        guard.clone(),
+                                    )?,
                                 }
                             }
                         };
@@ -874,6 +880,12 @@ pub trait IsqModel {
                                         &comm,
                                         guard.clone(),
                                     )?,
+                                    QuantizedSerdeType::Afq => AfqLayer::deserialize(
+                                        Cow::from(artifact),
+                                        &devices[i],
+                                        &comm,
+                                        guard.clone(),
+                                    )?,
                                 }
                             }
                         };
 
@@ -43,7 +43,7 @@ use candle_core::{Device, Tensor, Var};
 use hf_hub::Cache;
 use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};
 use indicatif::MultiProgress;
-use mistralrs_quant::{GgufMatMul, HqqLayer, IsqType, QuantizedSerdeType};
+use mistralrs_quant::{AfqLayer, GgufMatMul, HqqLayer, IsqType, QuantizedSerdeType};
 use rand_isaac::Isaac64Rng;
 use regex_automata::meta::Regex;
 use std::any::Any;
@@ -365,6 +365,10 @@ impl Loader for NormalLoader {
                                 }
                                 QuantizedSerdeType::Fp8 => IsqType::F8E4M3.pack_factor(dtype),
                                 QuantizedSerdeType::Unquant => 1,
+                                QuantizedSerdeType::Afq => {
+                                    AfqLayer::get_isq_type_from_uqff(Cow::Borrowed(artifact))?
+                                        .pack_factor(dtype)
+                                }
                             };
                             total_pack_factors += pack_factor;
                         }
 
@@ -38,7 +38,7 @@ use candle_core::{Device, Tensor, Var};
 use hf_hub::Cache;
 use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};
 use indicatif::MultiProgress;
-use mistralrs_quant::{GgufMatMul, HqqLayer, IsqType, QuantizedSerdeType};
+use mistralrs_quant::{AfqLayer, GgufMatMul, HqqLayer, IsqType, QuantizedSerdeType};
 use rand_isaac::Isaac64Rng;
 use regex_automata::meta::Regex;
 use std::any::Any;
@@ -305,6 +305,10 @@ impl Loader for VisionLoader {
                                 }
                                 QuantizedSerdeType::Fp8 => IsqType::F8E4M3.pack_factor(dtype),
                                 QuantizedSerdeType::Unquant => 1,
+                                QuantizedSerdeType::Afq => {
+                                    AfqLayer::get_isq_type_from_uqff(Cow::Borrowed(artifact))?
+                                        .pack_factor(dtype)
+                                }
                             };
                             total_pack_factors += pack_factor;
                         }