Skip to content

Commit 83e522a

Browse files
committed
Improve: Free threading examples & checks
1 parent 0093c3f commit 83e522a

File tree

2 files changed

+59
-20
lines changed

2 files changed

+59
-20
lines changed

README.md

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -491,7 +491,7 @@ slow_output_color = (alpha * light_intensity * diffuse_component + beta * specul
491491

492492
By default, computations use a single CPU core.
493493
To override this behavior, use the `threads` argument.
494-
Set it to `0` to use all available CPU cores.
494+
Set it to `0` to use all available CPU cores and let the underlying C library manage the thread pool.
495495
Here is an example of dealing with large sets of binary vectors:
496496

497497
```py
@@ -507,9 +507,42 @@ distances = simsimd.cdist(matrix1, matrix2,
507507
)
508508
```
509509

510+
Alternatively, on free-threaded Python 3.13t builds, you can combine single-threaded SimSIMD operations with Python's `concurrent.futures.ThreadPoolExecutor` to parallelize the computations.
510511
By default, the output distances will be stored in double-precision `float64` floating-point numbers.
511512
That behavior may not be space-efficient, especially if you are computing the Hamming distance between short binary vectors, which will generally fit into the 8x smaller `uint8` or 4x smaller `uint16` types.
512-
To override this behavior, use the `dtype` argument.
513+
To override this behavior, use the `out_dtype` argument, or consider pre-allocating the output array and passing it to the `out` argument.
514+
A more complete example may look like this:
515+
516+
```py
517+
from multiprocessing import cpu_count
518+
from concurrent.futures import ThreadPoolExecutor
519+
from simsimd import cosine
520+
import numpy as np
521+
522+
# Generate large dataset
523+
vectors_a = np.random.rand(100_000, 1536).astype(np.float32)
524+
vectors_b = np.random.rand(100_000, 1536).astype(np.float32)
525+
distances = np.zeros((100_000,), dtype=np.float32)
526+
527+
def compute_batch(start_idx, end_idx):
528+
batch_a = vectors_a[start_idx:end_idx]
529+
batch_b = vectors_b[start_idx:end_idx]
530+
cosine(batch_a, batch_b, out=distances[start_idx:end_idx])
531+
532+
# Use all CPU cores with true parallelism (no GIL!)
533+
num_threads = cpu_count()
534+
chunk_size = len(vectors_a) // num_threads
535+
536+
with ThreadPoolExecutor(max_workers=num_threads) as executor:
537+
futures = []
538+
for i in range(num_threads):
539+
start_idx = i * chunk_size
540+
end_idx = (i + 1) * chunk_size if i < num_threads - 1 else len(vectors_a)
541+
futures.append(executor.submit(compute_batch, start_idx, end_idx))
542+
543+
# Wait for all threads to finish (re-raising any worker exception); `distances` is filled in place
544+
results = [future.result() for future in futures]
545+
```
513546

514547
### Helper Functions
515548

scripts/test.py

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
Module: test.py
55
66
This module contains a suite of tests for the `simsimd` package.
7-
It compares various SIMD kernels (like Dot-products, squared Euclidean, and Cosine distances)
8-
with their NumPy or baseline counterparts, testing accuracy for different data types including
7+
It compares various SIMD kernels (like Dot-products, squared Euclidean, and Cosine distances)
8+
with their NumPy or baseline counterparts, testing accuracy for different data types including
99
floating-point, integer, and complex numbers.
1010
1111
The tests cover:
@@ -1421,31 +1421,36 @@ def test_cdist_hamming(ndim, out_dtype, capability):
14211421
def test_gil_free_threading():
14221422
"""Test SimSIMD in Python 3.13t free-threaded mode if available."""
14231423
import sys
1424-
1424+
import sysconfig
1425+
14251426
# Check if we're in a GIL-free environment
1427+
# https://py-free-threading.github.io/running-gil-disabled/
14261428
version = sys.version_info
14271429
if version.major == 3 and version.minor >= 13:
1428-
if sys._is_gil_enabled():
1429-
pytest.skip("GIL is enabled, skipping GIL-free threading test")
1430+
is_free_threaded = bool(sysconfig.get_config_var("Py_GIL_DISABLED"))
1431+
if not is_free_threaded:
1432+
pytest.skip("Uses non-free-threaded Python, skipping GIL-related tests")
1433+
if sys._is_gil_enabled():
1434+
pytest.skip("GIL is enabled, skipping GIL-related tests")
14301435
else:
1431-
pytest.skip("Python < 3.13t, skipping GIL-free threading test")
1432-
1436+
pytest.skip("Python < 3.13t, skipping GIL-related tests")
1437+
14331438
import multiprocessing
14341439
import concurrent.futures
14351440

14361441
num_threads = multiprocessing.cpu_count()
14371442
vectors_a = np.random.rand(32 * 1024 * num_threads, 1024).astype(np.float32)
14381443
vectors_b = np.random.rand(32 * 1024 * num_threads, 1024).astype(np.float32)
14391444
distances = np.zeros(vectors_a.shape[0], dtype=np.float32)
1440-
1445+
14411446
def compute_batch(start_idx, end_idx) -> float:
14421447
"""Compute cosine distances for a batch."""
14431448
slice_a = vectors_a[start_idx:end_idx]
14441449
slice_b = vectors_b[start_idx:end_idx]
14451450
slice_distances = distances[start_idx:end_idx]
1446-
simd.cosine(slice_a, slice_b, out=slice_distances)
1451+
simd.cosine(slice_a, slice_b, out=slice_distances)
14471452
return sum(slice_distances)
1448-
1453+
14491454
def compute_with_threads(threads: int) -> float:
14501455
"""Compute cosine distances using multiple threads."""
14511456
chunk_size = len(vectors_a) // threads
@@ -1455,36 +1460,37 @@ def compute_with_threads(threads: int) -> float:
14551460
start_idx = i * chunk_size
14561461
end_idx = (i + 1) * chunk_size if i < threads - 1 else len(vectors_a)
14571462
futures.append(executor.submit(compute_batch, start_idx, end_idx))
1458-
1463+
14591464
total_sum = 0.0
14601465
for future in concurrent.futures.as_completed(futures):
14611466
total_sum += future.result()
1462-
1467+
14631468
return total_sum
1464-
1469+
14651470
# Dual-threaded baseline is better than single-threaded,
14661471
# as it will include the overhead of thread management.
14671472
start_time = time.time()
14681473
baseline_sum = compute_with_threads(2)
14691474
end_time = time.time()
14701475
baseline_duration = end_time - start_time
1471-
1476+
14721477
# Multi-threaded execution, using all available threads
14731478
start_time = time.time()
14741479
multi_sum = compute_with_threads(num_threads)
14751480
end_time = time.time()
14761481
multi_duration = end_time - start_time
14771482

14781483
# Verify that both runs produce the same total sum, within tolerance
1479-
assert np.allclose(baseline_sum, multi_sum, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL), \
1480-
f"Results differ: baseline {baseline_sum} vs multi-threaded {multi_sum}"
1484+
assert np.allclose(
1485+
baseline_sum, multi_sum, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL
1486+
), f"Results differ: baseline {baseline_sum} vs multi-threaded {multi_sum}"
14811487

1482-
# Warn if multi-threaded execution is slower than the baseline
1488+
# Warn if multi-threaded execution is slower than the baseline
14831489
if baseline_duration < multi_duration:
    # `pytest.warns` is an assertion helper (a context manager that checks the
    # enclosed code raises a warning); it does not emit one, and passing a
    # string as its second positional argument raises `TypeError`.
    # Emit the performance warning explicitly instead, so a slow multi-threaded
    # run is reported without failing the test.
    import warnings

    warnings.warn(
        f"{num_threads}-threaded execution took longer than 2-threaded baseline: {multi_duration:.2f}s vs {baseline_duration:.2f}s",
        UserWarning,
    )
14881494

14891495

14901496
if __name__ == "__main__":

0 commit comments

Comments
 (0)