Merge pull request #236 from ashvardanian/main-dev

ashvardanian · web-flow · commit 1d0d2aca9416 · 2024-11-19T11:20:33.000Z
Infer boolean types in Python
diff --git a/README.md b/README.md
@@ -162,10 +162,22 @@ dist = simsimd.cosine(vec1, vec2, "int8")
 dist = simsimd.cosine(vec1, vec2, "float16")
 dist = simsimd.cosine(vec1, vec2, "float32")
 dist = simsimd.cosine(vec1, vec2, "float64")
-dist = simsimd.hamming(vec1, vec2, "bit8")
+dist = simsimd.hamming(vec1, vec2, "bin8")
+```
+
+Binary distance functions are computed at the bit level,
+meaning a vector of 10x 8-bit integers will be treated as a sequence of 80 individual bits or dimensions.
+This differs from NumPy, which can't handle smaller-than-byte types, but you can still avoid the `bin8` argument by reinterpreting the packed vector as booleans:
+
+```py
+vec1 = np.packbits(np.random.randint(2, size=80).astype(np.uint8)).view(np.bool_)
+vec2 = np.packbits(np.random.randint(2, size=80).astype(np.uint8)).view(np.bool_)
+hamming_distance = simsimd.hamming(vec1, vec2)
+jaccard_distance = simsimd.jaccard(vec1, vec2)
 ```
 
 With other frameworks, like PyTorch, one can get a richer type-system than NumPy, but the lack of good CPython interoperability makes it hard to pass data without copies.
+Here is an example of using SimSIMD with PyTorch to compute the cosine similarity between two `bfloat16` vectors:
 
 ```py
 import numpy as np
@@ -181,7 +193,7 @@ torch.randn(8, out=vec2)
 
 # Both libs will look into the same memory buffers and report the same results
 dist_slow = 1 - torch.nn.functional.cosine_similarity(vec1, vec2, dim=0)
-dist_fast = simsimd.cosine(buf1, buf2, "bf16")
+dist_fast = simsimd.cosine(buf1, buf2, "bfloat16")
 ```
 
 It also allows using SimSIMD for half-precision complex numbers, which NumPy does not support.
@@ -254,9 +266,9 @@ distances: DistancesTensor = simsimd.cdist(matrix1, matrix2, metric="cosine")
 distances_array: np.ndarray = np.array(distances, copy=True)                    # now managed by NumPy
 ```
 
-### Elementwise Kernels
+### Element-wise Kernels
 
-SimSIMD also provides mixed-precision elementwise kernels, where the input vectors and the output have the same numeric type, but the intermediate accumulators are of a higher precision.
+SimSIMD also provides mixed-precision element-wise kernels, where the input vectors and the output have the same numeric type, but the intermediate accumulators are of a higher precision.
 
 ```py
 import numpy as np
diff --git a/include/simsimd/binary.h b/include/simsimd/binary.h
@@ -89,10 +89,10 @@ SIMSIMD_INTERNAL simsimd_u32_t _simsimd_reduce_u8x16_neon(uint8x16_t vec) {
     // Sum the widened halves
     uint16x8_t sum16 = vaddq_u16(low_half, high_half);
 
-    // Now reduce the `uint16x8_t` to a single `uint32_t`
-    uint32x4_t sum32 = vpaddlq_u16(sum16);  // pairwise add into 32-bit integers
-    uint64x2_t sum64 = vpaddlq_u32(sum32);  // pairwise add into 64-bit integers
-    uint32_t final_sum = vaddvq_u64(sum64); // final horizontal add to 32-bit result
+    // Now reduce the `uint16x8_t` to a single `simsimd_u32_t`
+    uint32x4_t sum32 = vpaddlq_u16(sum16);       // pairwise add into 32-bit integers
+    uint64x2_t sum64 = vpaddlq_u32(sum32);       // pairwise add into 64-bit integers
+    simsimd_u32_t final_sum = vaddvq_u64(sum64); // final horizontal add to 32-bit result
     return final_sum;
 }
 
diff --git a/include/simsimd/dot.h b/include/simsimd/dot.h
@@ -362,9 +362,9 @@ SIMSIMD_PUBLIC void simsimd_dot_i8_neon(simsimd_i8_t const *a, simsimd_i8_t cons
     }
 
     // Take care of the tail:
-    int32_t ab = vaddvq_s32(ab_vec);
+    simsimd_i32_t ab = vaddvq_s32(ab_vec);
     for (; i < n; ++i) {
-        int32_t ai = a[i], bi = b[i];
+        simsimd_i32_t ai = a[i], bi = b[i];
         ab += ai * bi;
     }
 
@@ -383,9 +383,9 @@ SIMSIMD_PUBLIC void simsimd_dot_u8_neon(simsimd_u8_t const *a, simsimd_u8_t cons
     }
 
     // Take care of the tail:
-    uint32_t ab = vaddvq_u32(ab_vec);
+    simsimd_u32_t ab = vaddvq_u32(ab_vec);
     for (; i < n; ++i) {
-        uint32_t ai = a[i], bi = b[i];
+        simsimd_u32_t ai = a[i], bi = b[i];
         ab += ai * bi;
     }
 
diff --git a/include/simsimd/simsimd.h b/include/simsimd/simsimd.h
@@ -434,7 +434,7 @@ SIMSIMD_PUBLIC simsimd_capability_t _simsimd_capabilities_x86(void) {
 SIMSIMD_PUBLIC simsimd_capability_t _simsimd_capabilities_arm(void) {
 #if defined(_SIMSIMD_DEFINED_APPLE)
     // On Apple Silicon, `mrs` is not allowed in user-space, so we need to use the `sysctl` API.
-    uint32_t supports_neon = 0, supports_fp16 = 0, supports_bf16 = 0, supports_i8mm = 0;
+    unsigned supports_neon = 0, supports_fp16 = 0, supports_bf16 = 0, supports_i8mm = 0;
     size_t size = sizeof(supports_neon);
     if (sysctlbyname("hw.optional.neon", &supports_neon, &size, NULL, 0) != 0) supports_neon = 0;
     if (sysctlbyname("hw.optional.arm.FEAT_FP16", &supports_fp16, &size, NULL, 0) != 0) supports_fp16 = 0;
diff --git a/include/simsimd/spatial.h b/include/simsimd/spatial.h
@@ -595,10 +595,10 @@ SIMSIMD_PUBLIC void simsimd_l2sq_i8_neon(simsimd_i8_t const *a, simsimd_i8_t con
         uint8x16_t d_vec = vreinterpretq_u8_s8(vabdq_s8(a_vec, b_vec));
         d2_vec = vdotq_u32(d2_vec, d_vec, d_vec);
     }
-    uint32_t d2 = vaddvq_u32(d2_vec);
+    simsimd_u32_t d2 = vaddvq_u32(d2_vec);
     for (; i < n; ++i) {
-        int32_t n = (int32_t)a[i] - b[i];
-        d2 += (uint32_t)(n * n);
+        simsimd_i32_t n = (simsimd_i32_t)a[i] - b[i];
+        d2 += (simsimd_u32_t)(n * n);
     }
     *result = d2;
 }
@@ -693,9 +693,9 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_neon(simsimd_i8_t const *a, simsimd_i8_t cons
     //          products_high_vec = vmmlaq_s32(products_high_vec, v_vec, y_w_vecs.val[1]);
     //      }
     //      int32x4_t products_vec = vaddq_s32(products_high_vec, products_low_vec);
-    //      int32_t a2 = products_vec[0];
-    //      int32_t ab = products_vec[1];
-    //      int32_t b2 = products_vec[3];
+    //      simsimd_i32_t a2 = products_vec[0];
+    //      simsimd_i32_t ab = products_vec[1];
+    //      simsimd_i32_t b2 = products_vec[3];
     //
     // That solution is elegant, but it requires the additional `+i8mm` extension and is currently slower,
     // at least on AWS Graviton 3.
@@ -709,13 +709,13 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_neon(simsimd_i8_t const *a, simsimd_i8_t cons
         a2_vec = vdotq_s32(a2_vec, a_vec, a_vec);
         b2_vec = vdotq_s32(b2_vec, b_vec, b_vec);
     }
-    int32_t ab = vaddvq_s32(ab_vec);
-    int32_t a2 = vaddvq_s32(a2_vec);
-    int32_t b2 = vaddvq_s32(b2_vec);
+    simsimd_i32_t ab = vaddvq_s32(ab_vec);
+    simsimd_i32_t a2 = vaddvq_s32(a2_vec);
+    simsimd_i32_t b2 = vaddvq_s32(b2_vec);
 
     // Take care of the tail:
     for (; i < n; ++i) {
-        int32_t ai = a[i], bi = b[i];
+        simsimd_i32_t ai = a[i], bi = b[i];
         ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
     }
 
@@ -737,10 +737,10 @@ SIMSIMD_PUBLIC void simsimd_l2sq_u8_neon(simsimd_u8_t const *a, simsimd_u8_t con
         uint8x16_t d_vec = vabdq_u8(a_vec, b_vec);
         d2_vec = vdotq_u32(d2_vec, d_vec, d_vec);
     }
-    uint32_t d2 = vaddvq_u32(d2_vec);
+    simsimd_u32_t d2 = vaddvq_u32(d2_vec);
     for (; i < n; ++i) {
-        int32_t n = (int32_t)a[i] - b[i];
-        d2 += (uint32_t)(n * n);
+        simsimd_i32_t n = (simsimd_i32_t)a[i] - b[i];
+        d2 += (simsimd_u32_t)(n * n);
     }
     *result = d2;
 }
@@ -759,13 +759,13 @@ SIMSIMD_PUBLIC void simsimd_cos_u8_neon(simsimd_u8_t const *a, simsimd_u8_t cons
         a2_vec = vdotq_u32(a2_vec, a_vec, a_vec);
         b2_vec = vdotq_u32(b2_vec, b_vec, b_vec);
     }
-    uint32_t ab = vaddvq_u32(ab_vec);
-    uint32_t a2 = vaddvq_u32(a2_vec);
-    uint32_t b2 = vaddvq_u32(b2_vec);
+    simsimd_u32_t ab = vaddvq_u32(ab_vec);
+    simsimd_u32_t a2 = vaddvq_u32(a2_vec);
+    simsimd_u32_t b2 = vaddvq_u32(b2_vec);
 
     // Take care of the tail:
     for (; i < n; ++i) {
-        uint32_t ai = a[i], bi = b[i];
+        simsimd_u32_t ai = a[i], bi = b[i];
         ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
     }
 
@@ -1050,7 +1050,7 @@ SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f32_haswell(simsimd_f
     // Load the squares into an __m128 register for single-precision floating-point operations
     __m128 squares = _mm_set_ps(a2, b2, a2, b2); // We replicate to make use of full register
 
-    // Compute the reciprocal square root of the squares using _mm_rsqrt_ps (single-precision)
+    // Compute the reciprocal square root of the squares using `_mm_rsqrt_ps` (single-precision)
     __m128 rsqrts = _mm_rsqrt_ps(squares);
 
     // Perform one iteration of Newton-Raphson refinement to improve the precision of rsqrt:
diff --git a/python/lib.c b/python/lib.c
@@ -190,7 +190,7 @@ simsimd_datatype_t python_string_to_datatype(char const *name) {
 
     //! Boolean values:
     else if (same_string(name, "bin8") || // SimSIMD-specific
-             same_string(name, "c"))      // Named type
+             same_string(name, "?"))      // Named type
         return simsimd_datatype_b8_k;
 
     // Signed integers:
@@ -276,7 +276,7 @@ char const *datatype_to_python_string(simsimd_datatype_t dtype) {
     case simsimd_datatype_f32c_k: return "Zf";
     case simsimd_datatype_f16c_k: return "Ze";
     // Boolean values:
-    case simsimd_datatype_b8_k: return "c";
+    case simsimd_datatype_b8_k: return "?";
     // Signed integers:
     case simsimd_datatype_i8_k: return "b";
     case simsimd_datatype_i16_k: return "h";
diff --git a/scripts/test.py b/scripts/test.py
@@ -839,6 +839,13 @@ def test_dense_bits(ndim, metric, capability, stats_fixture):
     np.testing.assert_allclose(result, expected, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL)
     collect_errors(metric, ndim, "bin8", accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture)
 
+    # Aside from overriding the `dtype` parameter, we can also view as booleans
+    result_dt, result = profile(simd_kernel, np.packbits(a).view(np.bool_), np.packbits(b).view(np.bool_))
+    result = np.array(result)
+
+    np.testing.assert_allclose(result, expected, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL)
+    collect_errors(metric, ndim, "bin8", accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture)
+
 
 @pytest.mark.skipif(not numpy_available, reason="NumPy is not installed")
 @pytest.mark.skipif(not scipy_available, reason="SciPy is not installed")

Original file line number	Diff line number	Diff line change
`@@ -362,9 +362,9 @@ SIMSIMD_PUBLIC void simsimd_dot_i8_neon(simsimd_i8_t const *a, simsimd_i8_t cons`
`362`	`362`	`}`
`363`	`363`
`364`	`364`	`// Take care of the tail:`
`365`		`- int32_t ab = vaddvq_s32(ab_vec);`
	`365`	`+ simsimd_i32_t ab = vaddvq_s32(ab_vec);`
`366`	`366`	`for (; i < n; ++i) {`
`367`		`- int32_t ai = a[i], bi = b[i];`
	`367`	`+ simsimd_i32_t ai = a[i], bi = b[i];`
`368`	`368`	`ab += ai * bi;`
`369`	`369`	`}`
`370`	`370`
`@@ -383,9 +383,9 @@ SIMSIMD_PUBLIC void simsimd_dot_u8_neon(simsimd_u8_t const *a, simsimd_u8_t cons`
`383`	`383`	`}`
`384`	`384`
`385`	`385`	`// Take care of the tail:`
`386`		`- uint32_t ab = vaddvq_u32(ab_vec);`
	`386`	`+ simsimd_u32_t ab = vaddvq_u32(ab_vec);`
`387`	`387`	`for (; i < n; ++i) {`
`388`		`- uint32_t ai = a[i], bi = b[i];`
	`388`	`+ simsimd_u32_t ai = a[i], bi = b[i];`
`389`	`389`	`ab += ai * bi;`
`390`	`390`	`}`
`391`	`391`