Improve: Free threading examples & checks

ashvardanian · ashvardanian · commit 83e522afbd1f · 2025-07-06T13:28:43.000Z
diff --git a/README.md b/README.md
@@ -491,7 +491,7 @@ slow_output_color = (alpha * light_intensity * diffuse_component + beta * specul
 
 By default, computations use a single CPU core.
 To override this behavior, use the `threads` argument.
-Set it to `0` to use all available CPU cores.
+Set it to `0` to use all available CPU cores and let the underlying C library manage the thread pool.
 Here is an example of dealing with large sets of binary vectors:
 
 ```py
@@ -507,9 +507,42 @@ distances = simsimd.cdist(matrix1, matrix2,
 )
 ```
 
+Alternatively, on free-threaded Python builds, such as 3.13t, you can combine single-threaded SimSIMD calls with Python's `concurrent.futures.ThreadPoolExecutor` to parallelize the computations yourself.
 By default, the output distances will be stored in double-precision `float64` floating-point numbers.
 That behavior may not be space-efficient, especially if you are computing the hamming distance between short binary vectors, that will generally fit into 8x smaller `uint8` or `uint16` types.
-To override this behavior, use the `dtype` argument.
+To override this behavior, use the `out_dtype` argument, or consider pre-allocating the output array and passing it to the `out` argument.
+A more complete example, which pre-allocates the output array and fills disjoint slices of it from a `ThreadPoolExecutor`, may look like this:
+
+```py
+from multiprocessing import cpu_count
+from concurrent.futures import ThreadPoolExecutor
+from simsimd import cosine
+import numpy as np
+
+# Generate large dataset
+vectors_a = np.random.rand(100_000, 1536).astype(np.float32)
+vectors_b = np.random.rand(100_000, 1536).astype(np.float32)
+distances = np.zeros((100_000,), dtype=np.float32)
+
+def compute_batch(start_idx, end_idx):
+    batch_a = vectors_a[start_idx:end_idx]
+    batch_b = vectors_b[start_idx:end_idx]
+    cosine(batch_a, batch_b, out=distances[start_idx:end_idx])
+
+# Use all CPU cores with true parallelism (no GIL!)
+num_threads = cpu_count()
+chunk_size = len(vectors_a) // num_threads
+
+with ThreadPoolExecutor(max_workers=num_threads) as executor:
+    futures = []
+    for i in range(num_threads):
+        start_idx = i * chunk_size
+        end_idx = (i + 1) * chunk_size if i < num_threads - 1 else len(vectors_a)
+        futures.append(executor.submit(compute_batch, start_idx, end_idx))
+
+    # Wait for every batch to finish; `result()` re-raises any exception from a worker
+    results = [future.result() for future in futures]            
+```                                                               
 
 ### Helper Functions
 
diff --git a/scripts/test.py b/scripts/test.py
@@ -4,8 +4,8 @@
 Module: test.py
 
 This module contains a suite of tests for the `simsimd` package.
-It compares various SIMD kernels (like Dot-products, squared Euclidean, and Cosine distances) 
-with their NumPy or baseline counterparts, testing accuracy for different data types including 
+It compares various SIMD kernels (like Dot-products, squared Euclidean, and Cosine distances)
+with their NumPy or baseline counterparts, testing accuracy for different data types including
 floating-point, integer, and complex numbers.
 
 The tests cover:
@@ -1421,31 +1421,36 @@ def test_cdist_hamming(ndim, out_dtype, capability):
 def test_gil_free_threading():
     """Test SimSIMD in Python 3.13t free-threaded mode if available."""
     import sys
-    
+    import sysconfig
+
     # Check if we're in a GIL-free environment
+    # https://py-free-threading.github.io/running-gil-disabled/
     version = sys.version_info
     if version.major == 3 and version.minor >= 13:
-        if sys._is_gil_enabled():        
-            pytest.skip("GIL is enabled, skipping GIL-free threading test")
+        is_free_threaded = bool(sysconfig.get_config_var("Py_GIL_DISABLED"))
+        if not is_free_threaded:
+            pytest.skip("Uses non-free-threaded Python, skipping GIL-related tests")
+        if sys._is_gil_enabled():
+            pytest.skip("GIL is enabled, skipping GIL-related tests")
     else:
-        pytest.skip("Python < 3.13t, skipping GIL-free threading test")
-    
+        pytest.skip("Python < 3.13t, skipping GIL-related tests")
+
     import multiprocessing
     import concurrent.futures
 
     num_threads = multiprocessing.cpu_count()
     vectors_a = np.random.rand(32 * 1024 * num_threads, 1024).astype(np.float32)
     vectors_b = np.random.rand(32 * 1024 * num_threads, 1024).astype(np.float32)
     distances = np.zeros(vectors_a.shape[0], dtype=np.float32)
-    
+
     def compute_batch(start_idx, end_idx) -> float:
         """Compute cosine distances for a batch."""
         slice_a = vectors_a[start_idx:end_idx]
         slice_b = vectors_b[start_idx:end_idx]
         slice_distances = distances[start_idx:end_idx]
-        simd.cosine(slice_a, slice_b, out=slice_distances)        
+        simd.cosine(slice_a, slice_b, out=slice_distances)
         return sum(slice_distances)
-    
+
     def compute_with_threads(threads: int) -> float:
         """Compute cosine distances using multiple threads."""
         chunk_size = len(vectors_a) // threads
@@ -1455,36 +1460,37 @@ def compute_with_threads(threads: int) -> float:
                 start_idx = i * chunk_size
                 end_idx = (i + 1) * chunk_size if i < threads - 1 else len(vectors_a)
                 futures.append(executor.submit(compute_batch, start_idx, end_idx))
-        
+
         total_sum = 0.0
         for future in concurrent.futures.as_completed(futures):
             total_sum += future.result()
-        
+
         return total_sum
-    
+
     # Dual-threaded baseline is better than single-threaded,
     # as it will include the overhead of thread management.
     start_time = time.time()
     baseline_sum = compute_with_threads(2)
     end_time = time.time()
     baseline_duration = end_time - start_time
-    
+
     # Multi-threaded execution, using all available threads
     start_time = time.time()
     multi_sum = compute_with_threads(num_threads)
     end_time = time.time()
     multi_duration = end_time - start_time
 
     # Verify results are the same length and reasonable
-    assert np.allclose(baseline_sum, multi_sum, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL), \
-        f"Results differ: baseline {baseline_sum} vs multi-threaded {multi_sum}"
+    assert np.allclose(
+        baseline_sum, multi_sum, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL
+    ), f"Results differ: baseline {baseline_sum} vs multi-threaded {multi_sum}"
 
-    # Warn if multi-threaded execution is slower than the baseline    
+    # Warn if multi-threaded execution is slower than the baseline
     if baseline_duration < multi_duration:
         pytest.warns(
             UserWarning,
             f"{num_threads}-threaded execution took longer than 2-threaded baseline: {multi_duration:.2f}s vs {baseline_duration:.2f}s",
-        )        
+        )
 
 
 if __name__ == "__main__":