Commit 73a8520

Merge pull request #240 from ashvardanian/main-dev
Complex Bilinear Forms for Computational Physics
2 parents: 37ea5a2 + acc61b5

File tree

14 files changed: +2083 −1047 lines changed


.github/workflows/prerelease.yml

Lines changed: 5 additions & 5 deletions

@@ -27,7 +27,7 @@ jobs:
           fetch-depth: 0
           persist-credentials: false
       - name: Run TinySemVer
-        uses: ashvardanian/tinysemver@v2.0.7
+        uses: ashvardanian/tinysemver@v2.1.1
         with:
           verbose: "true"
           version-file: "VERSION"
@@ -204,11 +204,11 @@ jobs:
         os: [ubuntu-latest, macos-latest, windows-latest]
         python-version: ["37", "38", "39", "310", "311", "312", "313"]
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v4.2.2
       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v5.3.0
         with:
-          python-version: 3.x
+          python-version: 3.9

       # We only need QEMU for Linux builds
       - name: Setup QEMU
@@ -220,7 +220,7 @@ jobs:
         with:
           vs-version: "17.10"
       - name: Install cibuildwheel
-        run: python -m pip install cibuildwheel
+        run: python -m pip install cibuildwheel==2.21.3
       - name: Build wheels
         run: cibuildwheel --output-dir wheelhouse
         env:

.github/workflows/release.yml

Lines changed: 8 additions & 8 deletions

@@ -26,7 +26,7 @@ jobs:
           fetch-depth: 0
           persist-credentials: false
       - name: Run TinySemVer
-        uses: ashvardanian/tinysemver@v2.0.7
+        uses: ashvardanian/tinysemver@v2.1.1
         with:
           verbose: "true"
           version-file: "VERSION"
@@ -80,13 +80,13 @@ jobs:
         os: [ubuntu-latest, macos-latest, windows-latest]
         python-version: ["37", "38", "39", "310", "311", "312", "313"]
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v4.2.2
         with:
          ref: "main"
       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v5.3.0
         with:
-          python-version: 3.x
+          python-version: 3.9
       - name: Setup QEMU
         if: matrix.os == 'ubuntu-latest' # We only need QEMU for Linux builds
         uses: docker/setup-qemu-action@v3
@@ -96,7 +96,7 @@ jobs:
         with:
           vs-version: "17.10"
       - name: Install cibuildwheel
-        run: python -m pip install cibuildwheel
+        run: python -m pip install cibuildwheel==2.21.3
       - name: Build wheels
         run: cibuildwheel --output-dir wheelhouse
         env:
@@ -113,13 +113,13 @@ jobs:
     runs-on: ubuntu-latest
     needs: versioning
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v4.2.2
         with:
          ref: "main"
       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v5.3.0
         with:
-          python-version: 3.x
+          python-version: 3.9
       - name: Build source distribution
         run: pip install build &&
              python -m build --sdist

CONTRIBUTING.md

Lines changed: 9 additions & 1 deletion

@@ -63,6 +63,15 @@ cmake -D CMAKE_BUILD_TYPE=Release \
 cmake --build build_release --config Release
 ```

+When benchmarking, make sure to disable multi-threading in the BLAS library, as it may interfere with the results:
+
+```sh
+export OPENBLAS_NUM_THREADS=1     # for OpenBLAS
+export MKL_NUM_THREADS=1          # for Intel MKL
+export VECLIB_MAXIMUM_THREADS=1   # for Apple Accelerate
+export BLIS_NUM_THREADS=1         # for BLIS
+```
+
 ## Python

 Testing:
@@ -140,7 +149,6 @@ cibuildwheel --platform linux # works on any OS and builds all
 cibuildwheel --platform linux --archs x86_64    # 64-bit x86, the most common on desktop and servers
 cibuildwheel --platform linux --archs aarch64   # 64-bit Arm for mobile devices, Apple M-series, and AWS Graviton
 cibuildwheel --platform linux --archs i686      # 32-bit Linux
-cibuildwheel --platform linux --archs s390x     # emulating big-endian IBM Z
 cibuildwheel --platform macos                   # works only on MacOS
 cibuildwheel --platform windows                 # works only on Windows
 ```
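The new CONTRIBUTING.md note above only sets environment variables, so before benchmarking it can be worth confirming the thread caps actually took effect. A minimal sketch, assuming the third-party `threadpoolctl` package is installed; it is not a SimSIMD dependency and not part of this diff:

```py
# Sanity check: list the BLAS / OpenMP thread pools loaded by NumPy
# and confirm each one is capped at a single thread before benchmarking.
import numpy  # noqa: F401  (importing NumPy loads its BLAS backend)
from threadpoolctl import threadpool_info

for pool in threadpool_info():
    print(pool["internal_api"], pool["num_threads"])
    assert pool["num_threads"] == 1, "disable BLAS multi-threading before benchmarking"
```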

README.md

Lines changed: 175 additions & 34 deletions

@@ -9,7 +9,7 @@ The rare few that support minimal mixed precision, run only on one platform, and
 SimSIMD provides an alternative.
 1️⃣ SimSIMD functions are practically as fast as `memcpy`.
 2️⃣ Unlike BLAS, most kernels are designed for mixed-precision and bit-level operations.
-3️⃣ SimSIMD [compiles to more platforms than NumPy (105 vs 35)][compatibility] and has more backends than most BLAS implementations, and more high-level interfaces than most libraries.
+3️⃣ SimSIMD often [ships more binaries than NumPy][compatibility], has more backends than most BLAS implementations, and offers more high-level interfaces than most libraries.

 [benchmarks]: https://ashvardanian.com/posts/simsimd-faster-scipy
 [compatibility]: https://pypi.org/project/simsimd/#files
@@ -42,7 +42,7 @@ SimSIMD provides an alternative.

 ## Features

-__SimSIMD__ (Arabic: "سيمسيم دي") is a mixed-precision math library of __over 200 SIMD-optimized kernels__ extensively used in AI, Search, and DBMS workloads.
+__SimSIMD__ (Arabic: "سيمسيم دي") is a mixed-precision math library of __over 350 SIMD-optimized kernels__ extensively used in AI, Search, and DBMS workloads.
 Named after the iconic ["Open Sesame"](https://en.wikipedia.org/wiki/Open_sesame) command that opened doors to treasure in _Ali Baba and the Forty Thieves_, SimSimd can help you 10x the cost-efficiency of your computational pipelines.
 Implemented distance functions include:
@@ -92,17 +92,136 @@ You can learn more about the technical implementation details in the following b

 ## Benchmarks

-For reference, we use 1536-dimensional vectors, like the embeddings produced by the OpenAI Ada API.
-Comparing the serial code throughput produced by GCC 12 to hand-optimized kernels in SimSIMD, we see the following single-core improvements for the two most common vector-vector similarity metrics - the Cosine similarity and the Euclidean distance:
+<table style="width: 100%; text-align: center; table-layout: fixed;">
+  <colgroup>
+    <col style="width: 33%;">
+    <col style="width: 33%;">
+    <col style="width: 33%;">
+  </colgroup>
+  <tr>
+    <th align="center">NumPy</th>
+    <th align="center">C 99</th>
+    <th align="center">SimSIMD</th>
+  </tr>
+  <!-- Cosine distance with different precision levels -->
+  <tr>
+    <td colspan="3" align="center">cosine distances between 1536d vectors</td>
+  </tr>
+  <tr>
+    <td align="center"> <!-- scipy.spatial.distance.cosine -->
+      <code>int8</code><br/>
+      🚧 overflows<br/>
+      <code>bfloat16</code><br/>
+      🚧 not supported<br/>
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>40,481</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>21,451</b> ops/s<br/>
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>253,902</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>46,394</b> ops/s<br/>
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>212,421</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>52,904</b> ops/s
+    </td>
+    <td align="center"> <!-- serial -->
+      <code>int8</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>10,548,600</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>11,379,300</b> ops/s<br/>
+      <code>bfloat16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>119,835</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>403,909</b> ops/s<br/>
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>501,310</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>871,963</b> ops/s<br/>
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>882,484</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>399,661</b> ops/s<br/>
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>839,301</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>837,126</b> ops/s
+    </td>
+    <td align="center"> <!-- simsimd -->
+      <code>int8</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>16,151,800</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>13,524,000</b> ops/s<br/>
+      <code>bfloat16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>9,738,540</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>4,881,900</b> ops/s<br/>
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>7,627,600</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>3,316,810</b> ops/s<br/>
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>8,202,910</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>3,400,620</b> ops/s<br/>
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>1,538,530</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>1,678,920</b> ops/s
+    </td>
+  </tr>
+  <!-- Euclidean distance with different precision levels -->
+  <tr>
+    <td colspan="3" align="center">euclidean distances between 1536d vectors</td>
+  </tr>
+  <tr>
+    <td align="center"> <!-- scipy.spatial.distance.sqeuclidean -->
+      <code>int8</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>252,113</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>177,443</b> ops/s<br/>
+      <code>bfloat16</code><br/>
+      🚧 not supported<br/>
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>54,621</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>71,793</b> ops/s<br/>
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>424,944</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>292,629</b> ops/s<br/>
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>334,929</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>237,505</b> ops/s
+    </td>
+    <td align="center"> <!-- serial -->
+      <code>int8</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>6,690,110</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>4,114,160</b> ops/s<br/>
+      <code>bfloat16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>119,842</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>1,049,230</b> ops/s<br/>
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>196,413</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>911,370</b> ops/s<br/>
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>1,295,210</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>1,055,940</b> ops/s<br/>
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>1,215,190</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>905,782</b> ops/s
+    </td>
+    <td align="center"> <!-- simsimd -->
+      <code>int8</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>18,989,000</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>18,878,200</b> ops/s<br/>
+      <code>bfloat16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>9,727,210</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>4,233,420</b> ops/s<br/>
+      <code>float16</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>19,466,800</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>3,522,760</b> ops/s<br/>
+      <code>float32</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>8,924,100</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>3,602,650</b> ops/s<br/>
+      <code>float64</code><br/>
+      <span style="color:#ABABAB;">x86:</span> <b>1,701,740</b> &centerdot;
+      <span style="color:#ABABAB;">arm:</span> <b>1,735,840</b> ops/s
+    </td>
+  </tr>
+  <!-- Bilinear forms -->
+  <!-- Sparse set intersections -->
+</table>

-| Type       | Apple M2 Pro                  | Intel Sapphire Rapids            | AWS Graviton 4                  |
-| :--------- | ----------------------------: | -------------------------------: | ------------------------------: |
-| `float64`  | 18.5 → 28.8 GB/s <br/> + 56 % | 21.9 → 41.4 GB/s <br/> + 89 %    | 20.7 → 41.3 GB/s <br/> + 99 %   |
-| `float32`  | 9.2 → 29.6 GB/s <br/> + 221 % | 10.9 → 95.8 GB/s <br/> + 779 %   | 4.9 → 41.9 GB/s <br/> + 755 %   |
-| `float16`  | 4.6 → 14.6 GB/s <br/> + 217 % | 3.1 → 108.4 GB/s <br/> + 3,397 % | 5.4 → 39.3 GB/s <br/> + 627 %   |
-| `bfloat16` | 4.6 → 26.3 GB/s <br/> + 472 % | 0.8 → 59.5 GB/s <br/> + 7,437 %  | 2.5 → 29.9 GB/s <br/> + 1,096 % |
-| `int8`     | 25.8 → 47.1 GB/s <br/> + 83 % | 33.1 → 65.3 GB/s <br/> + 97 %    | 35.2 → 43.5 GB/s <br/> + 24 %   |
-| `uint8`    |                               | 32.5 → 66.5 GB/s <br/> + 105 %   |                                 |
+> The code was compiled with GCC 12, using glibc v2.35.
+> The benchmarks were performed on Arm-based AWS Graviton3 `c7g` instances and Intel Sapphire Rapids `r7iz` instances.
+> Most modern Arm-based 64-bit CPUs will have similar relative speedups.
+> Variance within x86 CPUs will be larger.

 Similar speedups are often observed even when compared to BLAS and LAPACK libraries underlying most numerical computing libraries, including NumPy and SciPy in Python.
 Broader benchmarking results:
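The table above compares SciPy, serial C 99, and SimSIMD throughput. For orientation, one cell of it can be approximated with a short Python harness; a minimal sketch, assuming the `simsimd`, `numpy`, and `scipy` packages are installed, with an illustrative repetition count rather than the exact harness behind these numbers:

```py
# Rough single-core comparison for the float16 cosine-distance cell above:
# SciPy's generic implementation vs the SimSIMD kernel on 1536d vectors.
import time
import numpy as np
import scipy.spatial.distance as spd
import simsimd

a = np.random.rand(1536).astype(np.float16)
b = np.random.rand(1536).astype(np.float16)

def ops_per_second(fn, repetitions=10_000):
    start = time.perf_counter()
    for _ in range(repetitions):
        fn(a, b)
    return repetitions / (time.perf_counter() - start)

print(f"scipy:   {ops_per_second(spd.cosine):12,.0f} ops/s")
print(f"simsimd: {ops_per_second(simsimd.cosine):12,.0f} ops/s")
```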
@@ -689,23 +808,29 @@ To override compilation settings and switch between runtime and compile-time dis
 #include <simsimd/simsimd.h>

 int main() {
+    simsimd_i8_t i8s[1536];
+    simsimd_u8_t u8s[1536];
     simsimd_f64_t f64s[1536];
     simsimd_f32_t f32s[1536];
     simsimd_f16_t f16s[1536];
-    simsimd_i8_t i8[1536];
+    simsimd_bf16_t bf16s[1536];
     simsimd_distance_t distance;

     // Cosine distance between two vectors
     simsimd_cos_i8(i8s, i8s, 1536, &distance);
+    simsimd_cos_u8(u8s, u8s, 1536, &distance);
     simsimd_cos_f16(f16s, f16s, 1536, &distance);
     simsimd_cos_f32(f32s, f32s, 1536, &distance);
     simsimd_cos_f64(f64s, f64s, 1536, &distance);
+    simsimd_cos_bf16(bf16s, bf16s, 1536, &distance);

     // Euclidean distance between two vectors
     simsimd_l2sq_i8(i8s, i8s, 1536, &distance);
+    simsimd_l2sq_u8(u8s, u8s, 1536, &distance);
     simsimd_l2sq_f16(f16s, f16s, 1536, &distance);
     simsimd_l2sq_f32(f32s, f32s, 1536, &distance);
     simsimd_l2sq_f64(f64s, f64s, 1536, &distance);
+    simsimd_l2sq_bf16(bf16s, bf16s, 1536, &distance);

     return 0;
 }
@@ -717,25 +842,41 @@ int main() {
 #include <simsimd/simsimd.h>

 int main() {
-    simsimd_f64_t f64s[1536];
-    simsimd_f32_t f32s[1536];
+    // SimSIMD provides "sized" type-aliases without relying on `stdint.h`
+    simsimd_i8_t i8s[1536];
+    simsimd_u8_t u8s[1536];
     simsimd_f16_t f16s[1536];
-    simsimd_distance_t distance;
-
-    // Inner product between two vectors
-    simsimd_dot_f16(f16s, f16s, 1536, &distance);
-    simsimd_dot_f32(f32s, f32s, 1536, &distance);
-    simsimd_dot_f64(f64s, f64s, 1536, &distance);
+    simsimd_f32_t f32s[1536];
+    simsimd_f64_t f64s[1536];
+    simsimd_bf16_t bf16s[1536];
+    simsimd_distance_t product;
+
+    // Inner product between two real vectors
+    simsimd_dot_i8(i8s, i8s, 1536, &product);
+    simsimd_dot_u8(u8s, u8s, 1536, &product);
+    simsimd_dot_f16(f16s, f16s, 1536, &product);
+    simsimd_dot_f32(f32s, f32s, 1536, &product);
+    simsimd_dot_f64(f64s, f64s, 1536, &product);
+    simsimd_dot_bf16(bf16s, bf16s, 1536, &product);
+
+    // SimSIMD provides complex types with `real` and `imag` fields
+    simsimd_f64c_t f64cs[768];
+    simsimd_f32c_t f32cs[768];
+    simsimd_f16c_t f16cs[768];
+    simsimd_bf16c_t bf16cs[768];
+    simsimd_distance_t products[2]; // real and imaginary parts

     // Complex inner product between two vectors
-    simsimd_dot_f16c(f16s, f16s, 1536, &distance);
-    simsimd_dot_f32c(f32s, f32s, 1536, &distance);
-    simsimd_dot_f64c(f64s, f64s, 1536, &distance);
+    simsimd_dot_f16c(f16cs, f16cs, 768, &products[0]);
+    simsimd_dot_f32c(f32cs, f32cs, 768, &products[0]);
+    simsimd_dot_f64c(f64cs, f64cs, 768, &products[0]);
+    simsimd_dot_bf16c(bf16cs, bf16cs, 768, &products[0]);

     // Complex conjugate inner product between two vectors
-    simsimd_vdot_f16c(f16s, f16s, 1536, &distance);
-    simsimd_vdot_f32c(f32s, f32s, 1536, &distance);
-    simsimd_vdot_f64c(f64s, f64s, 1536, &distance);
+    simsimd_vdot_f16c(f16cs, f16cs, 768, &products[0]);
+    simsimd_vdot_f32c(f32cs, f32cs, 768, &products[0]);
+    simsimd_vdot_f64c(f64cs, f64cs, 768, &products[0]);
+    simsimd_vdot_bf16c(bf16cs, bf16cs, 768, &products[0]);
     return 0;
 }
 ```
@@ -763,17 +904,17 @@ int main() {
     simsimd_f64_t f64s[1536];
     simsimd_f32_t f32s[1536];
     simsimd_f16_t f16s[1536];
-    simsimd_distance_t distance;
+    simsimd_distance_t divergence;

     // Jensen-Shannon divergence between two vectors
-    simsimd_js_f16(f16s, f16s, 1536, &distance);
-    simsimd_js_f32(f32s, f32s, 1536, &distance);
-    simsimd_js_f64(f64s, f64s, 1536, &distance);
+    simsimd_js_f16(f16s, f16s, 1536, &divergence);
+    simsimd_js_f32(f32s, f32s, 1536, &divergence);
+    simsimd_js_f64(f64s, f64s, 1536, &divergence);

     // Kullback-Leibler divergence between two vectors
-    simsimd_kl_f16(f16s, f16s, 1536, &distance);
-    simsimd_kl_f32(f32s, f32s, 1536, &distance);
-    simsimd_kl_f64(f64s, f64s, 1536, &distance);
+    simsimd_kl_f16(f16s, f16s, 1536, &divergence);
+    simsimd_kl_f32(f32s, f32s, 1536, &divergence);
+    simsimd_kl_f64(f64s, f64s, 1536, &divergence);
     return 0;
 }
 ```
