Commit 73a8520

Merge pull request #240 from ashvardanian/main-dev
Complex Bilinear Forms for Computational Physics
2 parents: 37ea5a2 + acc61b5

File tree

14 files changed: +2083 −1047 lines changed


.github/workflows/prerelease.yml

Lines changed: 5 additions & 5 deletions

@@ -27,7 +27,7 @@ jobs:
           fetch-depth: 0
           persist-credentials: false
       - name: Run TinySemVer
-        uses: ashvardanian/tinysemver@v2.0.7
+        uses: ashvardanian/tinysemver@v2.1.1
         with:
           verbose: "true"
           version-file: "VERSION"
@@ -204,11 +204,11 @@ jobs:
         os: [ubuntu-latest, macos-latest, windows-latest]
         python-version: ["37", "38", "39", "310", "311", "312", "313"]
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v4.2.2
       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v5.3.0
         with:
-          python-version: 3.x
+          python-version: 3.9

       # We only need QEMU for Linux builds
       - name: Setup QEMU
@@ -220,7 +220,7 @@ jobs:
         with:
           vs-version: "17.10"
       - name: Install cibuildwheel
-        run: python -m pip install cibuildwheel
+        run: python -m pip install cibuildwheel==2.21.3
       - name: Build wheels
         run: cibuildwheel --output-dir wheelhouse
         env:

.github/workflows/release.yml

Lines changed: 8 additions & 8 deletions

@@ -26,7 +26,7 @@ jobs:
           fetch-depth: 0
           persist-credentials: false
       - name: Run TinySemVer
-        uses: ashvardanian/tinysemver@v2.0.7
+        uses: ashvardanian/tinysemver@v2.1.1
         with:
           verbose: "true"
           version-file: "VERSION"
@@ -80,13 +80,13 @@ jobs:
         os: [ubuntu-latest, macos-latest, windows-latest]
         python-version: ["37", "38", "39", "310", "311", "312", "313"]
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v4.2.2
         with:
          ref: "main"
       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v5.3.0
         with:
-          python-version: 3.x
+          python-version: 3.9
       - name: Setup QEMU
         if: matrix.os == 'ubuntu-latest' # We only need QEMU for Linux builds
         uses: docker/setup-qemu-action@v3
@@ -96,7 +96,7 @@ jobs:
         with:
           vs-version: "17.10"
       - name: Install cibuildwheel
-        run: python -m pip install cibuildwheel
+        run: python -m pip install cibuildwheel==2.21.3
       - name: Build wheels
         run: cibuildwheel --output-dir wheelhouse
         env:
@@ -113,13 +113,13 @@ jobs:
     runs-on: ubuntu-latest
     needs: versioning
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v4.2.2
         with:
          ref: "main"
       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v5.3.0
         with:
-          python-version: 3.x
+          python-version: 3.9
       - name: Build source distribution
         run: pip install build &&
              python -m build --sdist

CONTRIBUTING.md

Lines changed: 9 additions & 1 deletion

@@ -63,6 +63,15 @@ cmake -D CMAKE_BUILD_TYPE=Release \
 cmake --build build_release --config Release
 ```

+When benchmarking, make sure to disable multi-threading in the BLAS library, as it may interfere with the results:
+
+```sh
+export OPENBLAS_NUM_THREADS=1     # for OpenBLAS
+export MKL_NUM_THREADS=1          # for Intel MKL
+export VECLIB_MAXIMUM_THREADS=1   # for Apple Accelerate
+export BLIS_NUM_THREADS=1         # for BLIS
+```
+
 ## Python

 Testing:
@@ -140,7 +149,6 @@ cibuildwheel --platform linux # works on any OS and builds all
 cibuildwheel --platform linux --archs x86_64    # 64-bit x86, the most common on desktop and servers
 cibuildwheel --platform linux --archs aarch64   # 64-bit Arm for mobile devices, Apple M-series, and AWS Graviton
 cibuildwheel --platform linux --archs i686      # 32-bit Linux
-cibuildwheel --platform linux --archs s390x     # emulating big-endian IBM Z
 cibuildwheel --platform macos                   # works only on MacOS
 cibuildwheel --platform windows                 # works only on Windows
 ```
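The new CONTRIBUTING.md note above only sets environment variables, so before benchmarking it can be worth confirming the thread caps actually took effect. A minimal sketch, assuming the third-party `threadpoolctl` package is installed; it is not a SimSIMD dependency and not part of this diff:

```py
# Sanity check: list the BLAS / OpenMP thread pools loaded by NumPy
# and confirm each one is capped at a single thread before benchmarking.
import numpy  # noqa: F401  (importing NumPy loads its BLAS backend)
from threadpoolctl import threadpool_info

for pool in threadpool_info():
    print(pool["internal_api"], pool["num_threads"])
    assert pool["num_threads"] == 1, "disable BLAS multi-threading before benchmarking"
```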

README.md

Lines changed: 175 additions & 34 deletions

@@ -9,7 +9,7 @@ The rare few that support minimal mixed precision, run only on one platform, and
 SimSIMD provides an alternative.
 1️⃣ SimSIMD functions are practically as fast as `memcpy`.
 2️⃣ Unlike BLAS, most kernels are designed for mixed-precision and bit-level operations.
-3️⃣ SimSIMD [compiles to more platforms than NumPy (105 vs 35)][compatibility] and has more backends than most BLAS implementations, and more high-level interfaces than most libraries.
+3️⃣ SimSIMD often [ships more binaries than NumPy][compatibility], has more backends than most BLAS implementations, and offers more high-level interfaces than most libraries.

 [benchmarks]: https://ashvardanian.com/posts/simsimd-faster-scipy
 [compatibility]: https://pypi.org/project/simsimd/#files
@@ -42,7 +42,7 @@ SimSIMD provides an alternative.

 ## Features

-__SimSIMD__ (Arabic: "سيمسيم دي") is a mixed-precision math library of __over 200 SIMD-optimized kernels__ extensively used in AI, Search, and DBMS workloads.
+__SimSIMD__ (Arabic: "سيمسيم دي") is a mixed-precision math library of __over 350 SIMD-optimized kernels__ extensively used in AI, Search, and DBMS workloads.
 Named after the iconic ["Open Sesame"](https://en.wikipedia.org/wiki/Open_sesame) command that opened doors to treasure in _Ali Baba and the Forty Thieves_, SimSimd can help you 10x the cost-efficiency of your computational pipelines.
 Implemented distance functions include:
@@ -92,17 +92,136 @@ You can learn more about the technical implementation details in the following b

 ## Benchmarks

-For reference, we use 1536-dimensional vectors, like the embeddings produced by the OpenAI Ada API.
-Comparing the serial code throughput produced by GCC 12 to hand-optimized kernels in SimSIMD, we see the following single-core improvements for the two most common vector-vector similarity metrics - the Cosine similarity and the Euclidean distance:
+<table style="width: 100%; text-align: center; table-layout: fixed;">
+  <colgroup>
+    <col style="width: 33%;">
+    <col style="width: 33%;">
+    <col style="width: 33%;">
+  </colgroup>
+  <tr>
+    <th align="center">NumPy</th>
+    <th align="center">C 99</th>
+    <th align="center">SimSIMD</th>
+  </tr>
+  <!-- Cosine distance with different precision levels -->
+  <tr>
+    <td colspan="3" align="center">cosine distances between 1536d vectors</td>
+  </tr>
+  <tr>
+    <td align="center"> <!-- scipy.spatial.distance.cosine -->
+      <code>int8</code><br/>
+      🚧 overflows<br/>
+      <code>bfloat16</code><br/>
+      🚧 not supported<br/>
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>40,481</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>21,451</b> ops/s<br/>
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>253,902</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>46,394</b> ops/s<br/>
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>212,421</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>52,904</b> ops/s
+    </td>
+    <td align="center"> <!-- serial -->
+      <code>int8</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>10,548,600</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>11,379,300</b> ops/s<br/>
+      <code>bfloat16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>119,835</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>403,909</b> ops/s<br/>
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>501,310</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>871,963</b> ops/s<br/>
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>882,484</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>399,661</b> ops/s<br/>
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>839,301</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>837,126</b> ops/s
+    </td>
+    <td align="center"> <!-- simsimd -->
+      <code>int8</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>16,151,800</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>13,524,000</b> ops/s<br/>
+      <code>bfloat16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>9,738,540</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>4,881,900</b> ops/s<br/>
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>7,627,600</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>3,316,810</b> ops/s<br/>
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>8,202,910</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>3,400,620</b> ops/s<br/>
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>1,538,530</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>1,678,920</b> ops/s
+    </td>
+  </tr>
+  <!-- Euclidean distance with different precision levels -->
+  <tr>
+    <td colspan="3" align="center">euclidean distances between 1536d vectors</td>
+  </tr>
+  <tr>
+    <td align="center"> <!-- scipy.spatial.distance.sqeuclidean -->
+      <code>int8</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>252,113</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>177,443</b> ops/s<br/>
+      <code>bfloat16</code><br/>
+      🚧 not supported<br/>
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>54,621</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>71,793</b> ops/s<br/>
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>424,944</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>292,629</b> ops/s<br/>
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>334,929</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>237,505</b> ops/s
+    </td>
+    <td align="center"> <!-- serial -->
+      <code>int8</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>6,690,110</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>4,114,160</b> ops/s<br/>
+      <code>bfloat16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>119,842</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>1,049,230</b> ops/s<br/>
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>196,413</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>911,370</b> ops/s<br/>
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>1,295,210</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>1,055,940</b> ops/s<br/>
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>1,215,190</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>905,782</b> ops/s
+    </td>
+    <td align="center"> <!-- simsimd -->
+      <code>int8</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>18,989,000</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>18,878,200</b> ops/s<br/>
+      <code>bfloat16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>9,727,210</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>4,233,420</b> ops/s<br/>
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>19,466,800</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>3,522,760</b> ops/s<br/>
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>8,924,100</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>3,602,650</b> ops/s<br/>
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>1,701,740</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>1,735,840</b> ops/s
+    </td>
+  </tr>
+  <!-- Bilinear forms -->
+  <!-- Sparse set intersections -->
+</table>

-| Type       | Apple M2 Pro                  | Intel Sapphire Rapids            | AWS Graviton 4                  |
-| :--------- | ----------------------------: | -------------------------------: | ------------------------------: |
-| `float64`  | 18.5 → 28.8 GB/s <br/> + 56 % | 21.9 → 41.4 GB/s <br/> + 89 %    | 20.7 → 41.3 GB/s <br/> + 99 %   |
-| `float32`  | 9.2 → 29.6 GB/s <br/> + 221 % | 10.9 → 95.8 GB/s <br/> + 779 %   | 4.9 → 41.9 GB/s <br/> + 755 %   |
-| `float16`  | 4.6 → 14.6 GB/s <br/> + 217 % | 3.1 → 108.4 GB/s <br/> + 3,397 % | 5.4 → 39.3 GB/s <br/> + 627 %   |
-| `bfloat16` | 4.6 → 26.3 GB/s <br/> + 472 % | 0.8 → 59.5 GB/s <br/> + 7,437 %  | 2.5 → 29.9 GB/s <br/> + 1,096 % |
-| `int8`     | 25.8 → 47.1 GB/s <br/> + 83 % | 33.1 → 65.3 GB/s <br/> + 97 %    | 35.2 → 43.5 GB/s <br/> + 24 %   |
-| `uint8`    |                               | 32.5 → 66.5 GB/s <br/> + 105 %   |                                 |
+> The code was compiled with GCC 12, using glibc v2.35.
+> The benchmarks were performed on Arm-based AWS Graviton3 `c7g` instances and Intel Sapphire Rapids `r7iz` instances.
+> Most modern Arm-based 64-bit CPUs will have similar relative speedups.
+> Variance within x86 CPUs will be larger.

 Similar speedups are often observed even when compared to BLAS and LAPACK libraries underlying most numerical computing libraries, including NumPy and SciPy in Python.
 Broader benchmarking results:
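The table above compares SciPy, serial C 99, and SimSIMD throughput. For orientation, one cell of it can be approximated with a short Python harness; a minimal sketch, assuming the `simsimd`, `numpy`, and `scipy` packages are installed, with an illustrative repetition count rather than the exact harness behind these numbers:

```py
# Rough single-core comparison for the float16 cosine-distance cell above:
# SciPy's generic implementation vs the SimSIMD kernel on 1536d vectors.
import time
import numpy as np
import scipy.spatial.distance as spd
import simsimd

a = np.random.rand(1536).astype(np.float16)
b = np.random.rand(1536).astype(np.float16)

def ops_per_second(fn, repetitions=10_000):
    start = time.perf_counter()
    for _ in range(repetitions):
        fn(a, b)
    return repetitions / (time.perf_counter() - start)

print(f"scipy:   {ops_per_second(spd.cosine):12,.0f} ops/s")
print(f"simsimd: {ops_per_second(simsimd.cosine):12,.0f} ops/s")
```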
@@ -689,23 +808,29 @@ To override compilation settings and switch between runtime and compile-time dis
 #include <simsimd/simsimd.h>

 int main() {
+    simsimd_i8_t i8s[1536];
+    simsimd_u8_t u8s[1536];
     simsimd_f64_t f64s[1536];
     simsimd_f32_t f32s[1536];
     simsimd_f16_t f16s[1536];
-    simsimd_i8_t i8[1536];
+    simsimd_bf16_t bf16s[1536];
     simsimd_distance_t distance;

     // Cosine distance between two vectors
     simsimd_cos_i8(i8s, i8s, 1536, &distance);
+    simsimd_cos_u8(u8s, u8s, 1536, &distance);
     simsimd_cos_f16(f16s, f16s, 1536, &distance);
     simsimd_cos_f32(f32s, f32s, 1536, &distance);
     simsimd_cos_f64(f64s, f64s, 1536, &distance);
+    simsimd_cos_bf16(bf16s, bf16s, 1536, &distance);

     // Euclidean distance between two vectors
     simsimd_l2sq_i8(i8s, i8s, 1536, &distance);
+    simsimd_l2sq_u8(u8s, u8s, 1536, &distance);
     simsimd_l2sq_f16(f16s, f16s, 1536, &distance);
     simsimd_l2sq_f32(f32s, f32s, 1536, &distance);
     simsimd_l2sq_f64(f64s, f64s, 1536, &distance);
+    simsimd_l2sq_bf16(bf16s, bf16s, 1536, &distance);

     return 0;
 }
@@ -717,25 +842,41 @@ int main() {
 #include <simsimd/simsimd.h>

 int main() {
-    simsimd_f64_t f64s[1536];
-    simsimd_f32_t f32s[1536];
+    // SimSIMD provides "sized" type-aliases without relying on `stdint.h`
+    simsimd_i8_t i8s[1536];
+    simsimd_u8_t u8s[1536];
     simsimd_f16_t f16s[1536];
-    simsimd_distance_t distance;
-
-    // Inner product between two vectors
-    simsimd_dot_f16(f16s, f16s, 1536, &distance);
-    simsimd_dot_f32(f32s, f32s, 1536, &distance);
-    simsimd_dot_f64(f64s, f64s, 1536, &distance);
+    simsimd_f32_t f32s[1536];
+    simsimd_f64_t f64s[1536];
+    simsimd_bf16_t bf16s[1536];
+    simsimd_distance_t product;
+
+    // Inner product between two real vectors
+    simsimd_dot_i8(i8s, i8s, 1536, &product);
+    simsimd_dot_u8(u8s, u8s, 1536, &product);
+    simsimd_dot_f16(f16s, f16s, 1536, &product);
+    simsimd_dot_f32(f32s, f32s, 1536, &product);
+    simsimd_dot_f64(f64s, f64s, 1536, &product);
+    simsimd_dot_bf16(bf16s, bf16s, 1536, &product);
+
+    // SimSIMD provides complex types with `real` and `imag` fields
+    simsimd_f64c_t f64cs[768];
+    simsimd_f32c_t f32cs[768];
+    simsimd_f16c_t f16cs[768];
+    simsimd_bf16c_t bf16cs[768];
+    simsimd_distance_t products[2]; // real and imaginary parts

     // Complex inner product between two vectors
-    simsimd_dot_f16c(f16s, f16s, 1536, &distance);
-    simsimd_dot_f32c(f32s, f32s, 1536, &distance);
-    simsimd_dot_f64c(f64s, f64s, 1536, &distance);
+    simsimd_dot_f16c(f16cs, f16cs, 768, &products[0]);
+    simsimd_dot_f32c(f32cs, f32cs, 768, &products[0]);
+    simsimd_dot_f64c(f64cs, f64cs, 768, &products[0]);
+    simsimd_dot_bf16c(bf16cs, bf16cs, 768, &products[0]);

     // Complex conjugate inner product between two vectors
-    simsimd_vdot_f16c(f16s, f16s, 1536, &distance);
-    simsimd_vdot_f32c(f32s, f32s, 1536, &distance);
-    simsimd_vdot_f64c(f64s, f64s, 1536, &distance);
+    simsimd_vdot_f16c(f16cs, f16cs, 768, &products[0]);
+    simsimd_vdot_f32c(f32cs, f32cs, 768, &products[0]);
+    simsimd_vdot_f64c(f64cs, f64cs, 768, &products[0]);
+    simsimd_vdot_bf16c(bf16cs, bf16cs, 768, &products[0]);
     return 0;
 }
 ```
@@ -763,17 +904,17 @@ int main() {
     simsimd_f64_t f64s[1536];
     simsimd_f32_t f32s[1536];
     simsimd_f16_t f16s[1536];
-    simsimd_distance_t distance;
+    simsimd_distance_t divergence;

     // Jensen-Shannon divergence between two vectors
-    simsimd_js_f16(f16s, f16s, 1536, &distance);
-    simsimd_js_f32(f32s, f32s, 1536, &distance);
-    simsimd_js_f64(f64s, f64s, 1536, &distance);
+    simsimd_js_f16(f16s, f16s, 1536, &divergence);
+    simsimd_js_f32(f32s, f32s, 1536, &divergence);
+    simsimd_js_f64(f64s, f64s, 1536, &divergence);

     // Kullback-Leibler divergence between two vectors
-    simsimd_kl_f16(f16s, f16s, 1536, &distance);
-    simsimd_kl_f32(f32s, f32s, 1536, &distance);
-    simsimd_kl_f64(f64s, f64s, 1536, &distance);
+    simsimd_kl_f16(f16s, f16s, 1536, &divergence);
+    simsimd_kl_f32(f32s, f32s, 1536, &divergence);
+    simsimd_kl_f64(f64s, f64s, 1536, &divergence);
     return 0;
 }
 ```
