Skip to content

Commit 1d0d2ac

Browse files
authored
Merge pull request #236 from ashvardanian/main-dev
Infer boolean types in Python
2 parents 364e736 + daa41bd commit 1d0d2ac

File tree

7 files changed

+52
-33
lines changed

7 files changed

+52
-33
lines changed

README.md

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -162,10 +162,22 @@ dist = simsimd.cosine(vec1, vec2, "int8")
162162
dist = simsimd.cosine(vec1, vec2, "float16")
163163
dist = simsimd.cosine(vec1, vec2, "float32")
164164
dist = simsimd.cosine(vec1, vec2, "float64")
165-
dist = simsimd.hamming(vec1, vec2, "bit8")
165+
dist = simsimd.hamming(vec1, vec2, "bin8")
166+
```
167+
168+
Binary distance functions are computed at the bit level.
169+
This means that a vector of ten 8-bit integers is treated as a sequence of 80 individual bits, or dimensions.
170+
This differs from NumPy, which can't handle smaller-than-byte types, but you can still avoid the `bin8` argument by reinterpreting the vector as booleans:
171+
172+
```py
173+
vec1 = np.packbits(np.random.randint(2, size=80).astype(np.uint8)).view(np.bool_)
174+
vec2 = np.packbits(np.random.randint(2, size=80).astype(np.uint8)).view(np.bool_)
175+
hamming_distance = simsimd.hamming(vec1, vec2)
176+
jaccard_distance = simsimd.jaccard(vec1, vec2)
166177
```
167178

168179
With other frameworks, like PyTorch, one can get a richer type-system than NumPy, but the lack of good CPython interoperability makes it hard to pass data without copies.
180+
Here is an example of using SimSIMD with PyTorch to compute the cosine similarity between two `bfloat16` vectors:
169181

170182
```py
171183
import numpy as np
@@ -181,7 +193,7 @@ torch.randn(8, out=vec2)
181193

182194
# Both libs will look into the same memory buffers and report the same results
183195
dist_slow = 1 - torch.nn.functional.cosine_similarity(vec1, vec2, dim=0)
184-
dist_fast = simsimd.cosine(buf1, buf2, "bf16")
196+
dist_fast = simsimd.cosine(buf1, buf2, "bfloat16")
185197
```
186198

187199
It also allows using SimSIMD for half-precision complex numbers, which NumPy does not support.
@@ -254,9 +266,9 @@ distances: DistancesTensor = simsimd.cdist(matrix1, matrix2, metric="cosine")
254266
distances_array: np.ndarray = np.array(distances, copy=True) # now managed by NumPy
255267
```
256268

257-
### Elementwise Kernels
269+
### Element-wise Kernels
258270

259-
SimSIMD also provides mixed-precision elementwise kernels, where the input vectors and the output have the same numeric type, but the intermediate accumulators are of a higher precision.
271+
SimSIMD also provides mixed-precision element-wise kernels, where the input vectors and the output have the same numeric type, but the intermediate accumulators are of a higher precision.
260272

261273
```py
262274
import numpy as np

include/simsimd/binary.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,10 @@ SIMSIMD_INTERNAL simsimd_u32_t _simsimd_reduce_u8x16_neon(uint8x16_t vec) {
8989
// Sum the widened halves
9090
uint16x8_t sum16 = vaddq_u16(low_half, high_half);
9191

92-
// Now reduce the `uint16x8_t` to a single `uint32_t`
93-
uint32x4_t sum32 = vpaddlq_u16(sum16); // pairwise add into 32-bit integers
94-
uint64x2_t sum64 = vpaddlq_u32(sum32); // pairwise add into 64-bit integers
95-
uint32_t final_sum = vaddvq_u64(sum64); // final horizontal add to 32-bit result
92+
// Now reduce the `uint16x8_t` to a single `simsimd_u32_t`
93+
uint32x4_t sum32 = vpaddlq_u16(sum16); // pairwise add into 32-bit integers
94+
uint64x2_t sum64 = vpaddlq_u32(sum32); // pairwise add into 64-bit integers
95+
simsimd_u32_t final_sum = vaddvq_u64(sum64); // final horizontal add to 32-bit result
9696
return final_sum;
9797
}
9898

include/simsimd/dot.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -362,9 +362,9 @@ SIMSIMD_PUBLIC void simsimd_dot_i8_neon(simsimd_i8_t const *a, simsimd_i8_t cons
362362
}
363363

364364
// Take care of the tail:
365-
int32_t ab = vaddvq_s32(ab_vec);
365+
simsimd_i32_t ab = vaddvq_s32(ab_vec);
366366
for (; i < n; ++i) {
367-
int32_t ai = a[i], bi = b[i];
367+
simsimd_i32_t ai = a[i], bi = b[i];
368368
ab += ai * bi;
369369
}
370370

@@ -383,9 +383,9 @@ SIMSIMD_PUBLIC void simsimd_dot_u8_neon(simsimd_u8_t const *a, simsimd_u8_t cons
383383
}
384384

385385
// Take care of the tail:
386-
uint32_t ab = vaddvq_u32(ab_vec);
386+
simsimd_u32_t ab = vaddvq_u32(ab_vec);
387387
for (; i < n; ++i) {
388-
uint32_t ai = a[i], bi = b[i];
388+
simsimd_u32_t ai = a[i], bi = b[i];
389389
ab += ai * bi;
390390
}
391391

include/simsimd/simsimd.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,7 @@ SIMSIMD_PUBLIC simsimd_capability_t _simsimd_capabilities_x86(void) {
434434
SIMSIMD_PUBLIC simsimd_capability_t _simsimd_capabilities_arm(void) {
435435
#if defined(_SIMSIMD_DEFINED_APPLE)
436436
// On Apple Silicon, `mrs` is not allowed in user-space, so we need to use the `sysctl` API.
437-
uint32_t supports_neon = 0, supports_fp16 = 0, supports_bf16 = 0, supports_i8mm = 0;
437+
unsigned supports_neon = 0, supports_fp16 = 0, supports_bf16 = 0, supports_i8mm = 0;
438438
size_t size = sizeof(supports_neon);
439439
if (sysctlbyname("hw.optional.neon", &supports_neon, &size, NULL, 0) != 0) supports_neon = 0;
440440
if (sysctlbyname("hw.optional.arm.FEAT_FP16", &supports_fp16, &size, NULL, 0) != 0) supports_fp16 = 0;

include/simsimd/spatial.h

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -595,10 +595,10 @@ SIMSIMD_PUBLIC void simsimd_l2sq_i8_neon(simsimd_i8_t const *a, simsimd_i8_t con
595595
uint8x16_t d_vec = vreinterpretq_u8_s8(vabdq_s8(a_vec, b_vec));
596596
d2_vec = vdotq_u32(d2_vec, d_vec, d_vec);
597597
}
598-
uint32_t d2 = vaddvq_u32(d2_vec);
598+
simsimd_u32_t d2 = vaddvq_u32(d2_vec);
599599
for (; i < n; ++i) {
600-
int32_t n = (int32_t)a[i] - b[i];
601-
d2 += (uint32_t)(n * n);
600+
simsimd_i32_t n = (simsimd_i32_t)a[i] - b[i];
601+
d2 += (simsimd_u32_t)(n * n);
602602
}
603603
*result = d2;
604604
}
@@ -693,9 +693,9 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_neon(simsimd_i8_t const *a, simsimd_i8_t cons
693693
// products_high_vec = vmmlaq_s32(products_high_vec, v_vec, y_w_vecs.val[1]);
694694
// }
695695
// int32x4_t products_vec = vaddq_s32(products_high_vec, products_low_vec);
696-
// int32_t a2 = products_vec[0];
697-
// int32_t ab = products_vec[1];
698-
// int32_t b2 = products_vec[3];
696+
// simsimd_i32_t a2 = products_vec[0];
697+
// simsimd_i32_t ab = products_vec[1];
698+
// simsimd_i32_t b2 = products_vec[3];
699699
//
700700
// That solution is elegant, but it requires the additional `+i8mm` extension and is currently slower,
701701
// at least on AWS Graviton 3.
@@ -709,13 +709,13 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_neon(simsimd_i8_t const *a, simsimd_i8_t cons
709709
a2_vec = vdotq_s32(a2_vec, a_vec, a_vec);
710710
b2_vec = vdotq_s32(b2_vec, b_vec, b_vec);
711711
}
712-
int32_t ab = vaddvq_s32(ab_vec);
713-
int32_t a2 = vaddvq_s32(a2_vec);
714-
int32_t b2 = vaddvq_s32(b2_vec);
712+
simsimd_i32_t ab = vaddvq_s32(ab_vec);
713+
simsimd_i32_t a2 = vaddvq_s32(a2_vec);
714+
simsimd_i32_t b2 = vaddvq_s32(b2_vec);
715715

716716
// Take care of the tail:
717717
for (; i < n; ++i) {
718-
int32_t ai = a[i], bi = b[i];
718+
simsimd_i32_t ai = a[i], bi = b[i];
719719
ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
720720
}
721721

@@ -737,10 +737,10 @@ SIMSIMD_PUBLIC void simsimd_l2sq_u8_neon(simsimd_u8_t const *a, simsimd_u8_t con
737737
uint8x16_t d_vec = vabdq_u8(a_vec, b_vec);
738738
d2_vec = vdotq_u32(d2_vec, d_vec, d_vec);
739739
}
740-
uint32_t d2 = vaddvq_u32(d2_vec);
740+
simsimd_u32_t d2 = vaddvq_u32(d2_vec);
741741
for (; i < n; ++i) {
742-
int32_t n = (int32_t)a[i] - b[i];
743-
d2 += (uint32_t)(n * n);
742+
simsimd_i32_t n = (simsimd_i32_t)a[i] - b[i];
743+
d2 += (simsimd_u32_t)(n * n);
744744
}
745745
*result = d2;
746746
}
@@ -759,13 +759,13 @@ SIMSIMD_PUBLIC void simsimd_cos_u8_neon(simsimd_u8_t const *a, simsimd_u8_t cons
759759
a2_vec = vdotq_u32(a2_vec, a_vec, a_vec);
760760
b2_vec = vdotq_u32(b2_vec, b_vec, b_vec);
761761
}
762-
uint32_t ab = vaddvq_u32(ab_vec);
763-
uint32_t a2 = vaddvq_u32(a2_vec);
764-
uint32_t b2 = vaddvq_u32(b2_vec);
762+
simsimd_u32_t ab = vaddvq_u32(ab_vec);
763+
simsimd_u32_t a2 = vaddvq_u32(a2_vec);
764+
simsimd_u32_t b2 = vaddvq_u32(b2_vec);
765765

766766
// Take care of the tail:
767767
for (; i < n; ++i) {
768-
uint32_t ai = a[i], bi = b[i];
768+
simsimd_u32_t ai = a[i], bi = b[i];
769769
ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
770770
}
771771

@@ -1050,7 +1050,7 @@ SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f32_haswell(simsimd_f
10501050
// Load the squares into an __m128 register for single-precision floating-point operations
10511051
__m128 squares = _mm_set_ps(a2, b2, a2, b2); // We replicate to make use of full register
10521052

1053-
// Compute the reciprocal square root of the squares using _mm_rsqrt_ps (single-precision)
1053+
// Compute the reciprocal square root of the squares using `_mm_rsqrt_ps` (single-precision)
10541054
__m128 rsqrts = _mm_rsqrt_ps(squares);
10551055

10561056
// Perform one iteration of Newton-Raphson refinement to improve the precision of rsqrt:

python/lib.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ simsimd_datatype_t python_string_to_datatype(char const *name) {
190190

191191
//! Boolean values:
192192
else if (same_string(name, "bin8") || // SimSIMD-specific
193-
same_string(name, "c")) // Named type
193+
same_string(name, "?")) // Named type
194194
return simsimd_datatype_b8_k;
195195

196196
// Signed integers:
@@ -276,7 +276,7 @@ char const *datatype_to_python_string(simsimd_datatype_t dtype) {
276276
case simsimd_datatype_f32c_k: return "Zf";
277277
case simsimd_datatype_f16c_k: return "Ze";
278278
// Boolean values:
279-
case simsimd_datatype_b8_k: return "c";
279+
case simsimd_datatype_b8_k: return "?";
280280
// Signed integers:
281281
case simsimd_datatype_i8_k: return "b";
282282
case simsimd_datatype_i16_k: return "h";

scripts/test.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -839,6 +839,13 @@ def test_dense_bits(ndim, metric, capability, stats_fixture):
839839
np.testing.assert_allclose(result, expected, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL)
840840
collect_errors(metric, ndim, "bin8", accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture)
841841

842+
# Aside from overriding the `dtype` parameter, we can also view as booleans
843+
result_dt, result = profile(simd_kernel, np.packbits(a).view(np.bool_), np.packbits(b).view(np.bool_))
844+
result = np.array(result)
845+
846+
np.testing.assert_allclose(result, expected, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL)
847+
collect_errors(metric, ndim, "bin8", accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture)
848+
842849

843850
@pytest.mark.skipif(not numpy_available, reason="NumPy is not installed")
844851
@pytest.mark.skipif(not scipy_available, reason="SciPy is not installed")

0 commit comments

Comments
 (0)