Commit bf5a7d2

Add: Half-precision converters for C/Rust

1 parent c6c0698

4 files changed: +317 -61 lines

README.md
Lines changed: 109 additions & 43 deletions
@@ -454,7 +454,7 @@ import numpy as np
 from simsimd import fma, wsum
 
 # Let's take two FullHD video frames
-first_frame = np.random.randn(1920 * 1024).astype(np.uint8)
+first_frame = np.random.randn(1920 * 1024).astype(np.uint8)
 second_frame = np.random.randn(1920 * 1024).astype(np.uint8)
 average_frame = np.empty_like(first_frame)
 wsum(first_frame, second_frame, alpha=0.5, beta=0.5, out=average_frame)
@@ -479,7 +479,7 @@ alpha = 0.7 # Weight for the diffuse component
 beta = 0.3 # Weight for the specular component
 
 # Formula: color = alpha * light_intensity * diffuse_component + beta * specular_component
-fma(light_intensity, diffuse_component, specular_component,
+fma(light_intensity, diffuse_component, specular_component,
     dtype="float16", # Optional, unless it can't be inferred from the input
     alpha=alpha, beta=beta, out=output_color)
 
@@ -499,7 +499,7 @@ ndim = 1536 # OpenAI Ada embeddings
 matrix1 = np.packbits(np.random.randint(2, size=(10_000, ndim)).astype(np.uint8))
 matrix2 = np.packbits(np.random.randint(2, size=(1_000, ndim)).astype(np.uint8))
 
-distances = simsimd.cdist(matrix1, matrix2,
+distances = simsimd.cdist(matrix1, matrix2,
     metric="hamming", # Unlike SciPy, SimSIMD doesn't divide by the number of dimensions
     out_dtype="uint8", # so we can use `uint8` instead of `float64` to save memory.
     threads=0, # Use all CPU cores with OpenMP.
@@ -541,8 +541,38 @@ with ThreadPoolExecutor(max_workers=num_threads) as executor:
     futures.append(executor.submit(compute_batch, start_idx, end_idx))
 
 # Collect results from all threads
-results = [future.result() for future in futures]
-```
+results = [future.result() for future in futures]
+```
+
+### Half-Precision Brain-Float Numbers
+
+The "brain-float-16" is a popular machine learning format.
+It's broadly supported in hardware and is very machine-friendly, but software support is still lagging behind.
+[Unlike NumPy](https://github.com/numpy/numpy/issues/19808), you can already use `bf16` datatype in SimSIMD.
+Luckily, to downcast `f32` to `bf16` you only have to drop the last 16 bits:
+
+```py
+import numpy as np
+import simsimd as simd
+
+a = np.random.randn(ndim).astype(np.float32)
+b = np.random.randn(ndim).astype(np.float32)
+
+# NumPy doesn't natively support brain-float, so we need a trick!
+# Luckily, it's very easy to reduce the representation accuracy
+# by simply masking the low 16-bits of our 32-bit single-precision
+# numbers. We can also add `0x8000` to round the numbers.
+a_f32rounded = ((a.view(np.uint32) + 0x8000) & 0xFFFF0000).view(np.float32)
+b_f32rounded = ((b.view(np.uint32) + 0x8000) & 0xFFFF0000).view(np.float32)
+
+# To represent them as brain-floats, we need to drop the second half
+a_bf16 = np.right_shift(a_f32rounded.view(np.uint32), 16).astype(np.uint16)
+b_bf16 = np.right_shift(b_f32rounded.view(np.uint32), 16).astype(np.uint16)
+
+# Now we can compare the results
+expected = np.inner(a_f32rounded, b_f32rounded)
+result = simd.inner(a_bf16, b_bf16, "bf16")
+```
 
 ### Helper Functions
 
@@ -693,23 +723,48 @@ Binary similarity functions are available only for `u8` types.
 
 ### Half-Precision Floating-Point Numbers
 
-Rust has no native support for half-precision floating-point numbers, but SimSIMD provides a `f16` type.
-It has no functionality - it is a `transparent` wrapper around `u16` and can be used with `half` or any other half-precision library.
+Rust has no native support for half-precision floating-point numbers, but SimSIMD provides a `f16` type with built-in conversion methods.
+The underlying `u16` representation is publicly accessible for direct bit manipulation.
 
 ```rust
-use simsimd::SpatialSimilarity;
-use simsimd::f16 as SimF16;
+use simsimd::{SpatialSimilarity, f16};
+
+fn main() {
+    // Create f16 vectors using built-in conversion methods
+    let vector_a: Vec<f16> = vec![1.0, 2.0, 3.0].iter().map(|&x| f16::from_f32(x)).collect();
+    let vector_b: Vec<f16> = vec![4.0, 5.0, 6.0].iter().map(|&x| f16::from_f32(x)).collect();
+
+    // Compute the cosine similarity
+    let cosine_similarity = f16::cosine(&vector_a, &vector_b)
+        .expect("Vectors must be of the same length");
+
+    println!("Cosine Similarity: {}", cosine_similarity);
+
+    // Direct bit manipulation
+    let half = f16::from_f32(3.14159);
+    let bits = half.0; // Access raw u16 representation
+    let reconstructed = f16(bits);
+
+    // Convert back to f32
+    let float_value = half.to_f32();
+}
+```
+
+For interoperability with the `half` crate:
+
+```rust
+use simsimd::{SpatialSimilarity, f16 as SimF16};
 use half::f16 as HalfF16;
 
 fn main() {
-    let vector_a: Vec<HalfF16> = ...
-    let vector_b: Vec<HalfF16> = ...
+    let vector_a: Vec<HalfF16> = vec![1.0, 2.0, 3.0].iter().map(|&x| HalfF16::from_f32(x)).collect();
+    let vector_b: Vec<HalfF16> = vec![4.0, 5.0, 6.0].iter().map(|&x| HalfF16::from_f32(x)).collect();
 
-    let buffer_a: &[SimF16] = unsafe { std::slice::from_raw_parts(a_half.as_ptr() as *const SimF16, a_half.len()) };
-    let buffer_b: &[SimF16] = unsafe { std::slice::from_raw_parts(b_half.as_ptr() as *const SimF16, b_half.len()) };
+    // Safe reinterpret cast due to identical memory layout
+    let buffer_a: &[SimF16] = unsafe { std::slice::from_raw_parts(vector_a.as_ptr() as *const SimF16, vector_a.len()) };
+    let buffer_b: &[SimF16] = unsafe { std::slice::from_raw_parts(vector_b.as_ptr() as *const SimF16, vector_b.len()) };
 
-    // Compute the cosine similarity between vector_a and vector_b
-    let cosine_similarity = SimF16::cosine(&vector_a, &vector_b)
+    let cosine_similarity = SimF16::cosine(buffer_a, buffer_b)
         .expect("Vectors must be of the same length");
 
     println!("Cosine Similarity: {}", cosine_similarity);
@@ -719,31 +774,41 @@ fn main() {
 ### Half-Precision Brain-Float Numbers
 
 The "brain-float-16" is a popular machine learning format.
-It's broadly supported in hardware and is very machine-friendly, but software support is still lagging behind.
+It's broadly supported in hardware and is very machine-friendly, but software support is still lagging behind.
 [Unlike NumPy](https://github.com/numpy/numpy/issues/19808), you can already use `bf16` datatype in SimSIMD.
-Luckily, to downcast `f32` to `bf16` you only have to drop the last 16 bits:
+SimSIMD provides a `bf16` type with built-in conversion methods and direct bit access.
 
-```py
-import numpy as np
-import simsimd as simd
+```rust
+use simsimd::{SpatialSimilarity, bf16, f16};
 
-a = np.random.randn(ndim).astype(np.float32)
-b = np.random.randn(ndim).astype(np.float32)
+fn main() {
+    // Create bf16 vectors using built-in conversion methods
+    let vector_a: Vec<bf16> = vec![1.0, 2.0, 3.0].iter().map(|&x| bf16::from_f32(x)).collect();
+    let vector_b: Vec<bf16> = vec![4.0, 5.0, 6.0].iter().map(|&x| bf16::from_f32(x)).collect();
 
-# NumPy doesn't natively support brain-float, so we need a trick!
-# Luckily, it's very easy to reduce the representation accuracy
-# by simply masking the low 16-bits of our 32-bit single-precision
-# numbers. We can also add `0x8000` to round the numbers.
-a_f32rounded = ((a.view(np.uint32) + 0x8000) & 0xFFFF0000).view(np.float32)
-b_f32rounded = ((b.view(np.uint32) + 0x8000) & 0xFFFF0000).view(np.float32)
+    // Compute the cosine similarity
+    let cosine_similarity = bf16::cosine(&vector_a, &vector_b)
+        .expect("Vectors must be of the same length");
+
+    println!("Cosine Similarity: {}", cosine_similarity);
 
-# To represent them as brain-floats, we need to drop the second half
-a_bf16 = np.right_shift(a_f32rounded.view(np.uint32), 16).astype(np.uint16)
-b_bf16 = np.right_shift(b_f32rounded.view(np.uint32), 16).astype(np.uint16)
+    // Direct bit manipulation
+    let brain_half = bf16::from_f32(3.14159);
+    let bits = brain_half.0; // Access raw u16 representation
+    let reconstructed = bf16(bits);
+
+    // Convert back to f32
+    let float_value = brain_half.to_f32();
 
-# Now we can compare the results
-expected = np.inner(a_f32rounded, b_f32rounded)
-result = simd.inner(a_bf16, b_bf16, "bf16")
+    // Compare precision differences
+    let original = 3.14159_f32;
+    let f16_roundtrip = f16::from_f32(original).to_f32();
+    let bf16_roundtrip = bf16::from_f32(original).to_f32();
+
+    println!("Original: {}", original);
+    println!("f16 roundtrip: {}", f16_roundtrip);
+    println!("bf16 roundtrip: {}", bf16_roundtrip);
+}
 ```
 
 ### Dynamic Dispatch in Rust
@@ -760,6 +825,7 @@ println!("uses ice: {}", capabilities::uses_ice());
 println!("uses genoa: {}", capabilities::uses_genoa());
 println!("uses sapphire: {}", capabilities::uses_sapphire());
 println!("uses turin: {}", capabilities::uses_turin());
+println!("uses sierra: {}", capabilities::uses_sierra());
 ```
 
 ## Using SimSIMD in JavaScript
@@ -776,13 +842,13 @@ This will automatically happen unless you install the package with the `--ignore
 After you install it, you will be able to call the SimSIMD functions on various `TypedArray` variants:
 
 ```js
-const { sqeuclidean, cosine, inner, hamming, jaccard } = require('simsimd');
+const { sqeuclidean, cosine, inner, hamming, jaccard } = require("simsimd");
 
 const vectorA = new Float32Array([1.0, 2.0, 3.0]);
 const vectorB = new Float32Array([4.0, 5.0, 6.0]);
 
 const distance = sqeuclidean(vectorA, vectorB);
-console.log('Squared Euclidean Distance:', distance);
+console.log("Squared Euclidean Distance:", distance);
 ```
 
 Other numeric types and precision levels are supported as well.
@@ -798,8 +864,8 @@ When doing machine learning and vector search with high-dimensional vectors you
 You may want to project values from the $[-1, 1]$ range to the $[-127, 127]$ range and then cast them to `Int8Array`:
 
 ```js
-const quantizedVectorA = new Int8Array(vectorA.map(v => (v * 127)));
-const quantizedVectorB = new Int8Array(vectorB.map(v => (v * 127)));
+const quantizedVectorA = new Int8Array(vectorA.map((v) => v * 127));
+const quantizedVectorB = new Int8Array(vectorB.map((v) => v * 127));
 const distance = cosine(quantizedVectorA, quantizedVectorB);
 ```
 
@@ -808,7 +874,7 @@ You can map all positive values to `1` and all negative values and zero to `0`,
 After that, Hamming and Jaccard distances can be computed.
 
 ```js
-const { toBinary, hamming } = require('simsimd');
+const { toBinary, hamming } = require("simsimd");
 
 const binaryVectorA = toBinary(vectorA);
 const binaryVectorB = toBinary(vectorB);
@@ -919,7 +985,7 @@ int main() {
     simsimd_cos_f32(f32s, f32s, 1536, &distance);
     simsimd_cos_f64(f64s, f64s, 1536, &distance);
     simsimd_cos_bf16(bf16s, bf16s, 1536, &distance);
-
+
     // Euclidean distance between two vectors
     simsimd_l2sq_i8(i8s, i8s, 1536, &distance);
     simsimd_l2sq_u8(u8s, u8s, 1536, &distance);
@@ -1036,7 +1102,7 @@ To explicitly disable half-precision support, define the following macro before
 > This flag does just that and is used to produce the `simsimd.so` shared library, as well as the Python and other bindings.
 
 For Arm: `SIMSIMD_TARGET_NEON`, `SIMSIMD_TARGET_SVE`, `SIMSIMD_TARGET_SVE2`, `SIMSIMD_TARGET_NEON_F16`, `SIMSIMD_TARGET_SVE_F16`, `SIMSIMD_TARGET_NEON_BF16`, `SIMSIMD_TARGET_SVE_BF16`.
-For x86: `SIMSIMD_TARGET_HASWELL`, `SIMSIMD_TARGET_SKYLAKE`, `SIMSIMD_TARGET_ICE`, `SIMSIMD_TARGET_GENOA`, `SIMSIMD_TARGET_SAPPHIRE`, `SIMSIMD_TARGET_TURIN`, `SIMSIMD_TARGET_SIERRA`.
+For x86: `SIMSIMD_TARGET_HASWELL`, `SIMSIMD_TARGET_SKYLAKE`, `SIMSIMD_TARGET_ICE`, `SIMSIMD_TARGET_GENOA`, `SIMSIMD_TARGET_SAPPHIRE`, `SIMSIMD_TARGET_TURIN`, `SIMSIMD_TARGET_SIERRA`.
 
 > By default, SimSIMD automatically infers the target architecture and pre-compiles as many kernels as possible.
 > In some cases, you may want to explicitly disable some of the kernels.
@@ -1064,7 +1130,7 @@ In general there are a few principles that SimSIMD follows:
 
 Possibly, in the future:
 
-- Best effort computation silencing `NaN` components in low-precision inputs.
+- Best effort computation silencing `NaN` components in low-precision inputs.
 - Detect overflows and report the distance with a "signaling" `NaN`.
 
 Last, but not the least - don't build unless there is a demand for it.
@@ -1199,7 +1265,7 @@ SimSIMD defines `dot` and `vdot` kernels as:
 
 Where $\bar{b_i}$ is the complex conjugate of $b_i$.
 Putting that into Python code for scalar arrays:
-
+
 ```python
 def dot(a: List[number], b: List[number]) -> number:
     a_real, a_imaginary = a[0::2], a[1::2]

c/lib.c
Lines changed: 16 additions & 0 deletions
@@ -243,6 +243,22 @@ SIMSIMD_DYNAMIC int simsimd_uses_sierra(void) { return (simsimd_capabilities() &
 SIMSIMD_DYNAMIC int simsimd_uses_dynamic_dispatch(void) { return 1; }
 SIMSIMD_DYNAMIC int simsimd_flush_denormals(void) { return _simsimd_flush_denormals(); }
 
+SIMSIMD_DYNAMIC simsimd_f32_t simsimd_f16_to_f32(simsimd_f16_t const *x_ptr) {
+    return simsimd_f16_to_f32_implementation(x_ptr);
+}
+
+SIMSIMD_DYNAMIC void simsimd_f32_to_f16(simsimd_f32_t x, simsimd_f16_t *result_ptr) {
+    simsimd_f32_to_f16_implementation(x, result_ptr);
+}
+
+SIMSIMD_DYNAMIC simsimd_f32_t simsimd_bf16_to_f32(simsimd_bf16_t const *x_ptr) {
+    return simsimd_bf16_to_f32_implementation(x_ptr);
+}
+
+SIMSIMD_DYNAMIC void simsimd_f32_to_bf16(simsimd_f32_t x, simsimd_bf16_t *result_ptr) {
+    simsimd_f32_to_bf16_implementation(x, result_ptr);
+}
+
 SIMSIMD_DYNAMIC simsimd_capability_t simsimd_capabilities(void) {
     //! The latency of the CPUID instruction can be over 100 cycles, so we cache the result.
     static simsimd_capability_t static_capabilities = simsimd_cap_any_k;
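
These wrappers re-export the scalar converters through the dynamic-dispatch ABI, so callers can rely on the shared library instead of the header-internal `_implementation` helpers. Below is a minimal, hypothetical usage sketch, assuming the converters are reachable through the usual `simsimd/simsimd.h` umbrella header:

```c
#include <stdio.h>

#include <simsimd/simsimd.h>

int main(void) {
    simsimd_f32_t original = 3.14159f;
    simsimd_f16_t half;
    simsimd_bf16_t brain_half;

    // Downcast to both 16-bit formats through the exported converters
    simsimd_f32_to_f16(original, &half);
    simsimd_f32_to_bf16(original, &brain_half);

    // Upcast back, losing the low mantissa bits along the way
    simsimd_f32_t f16_roundtrip = simsimd_f16_to_f32(&half);
    simsimd_f32_t bf16_roundtrip = simsimd_bf16_to_f32(&brain_half);

    printf("original:       %f\n", original);
    printf("f16 roundtrip:  %f\n", f16_roundtrip);  // ~3.140625 with a 10-bit mantissa
    printf("bf16 roundtrip: %f\n", bf16_roundtrip); // ~3.140625 with a 7-bit mantissa
    return 0;
}
```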

include/simsimd/types.h
Lines changed: 42 additions & 4 deletions
@@ -547,7 +547,7 @@ SIMSIMD_INTERNAL simsimd_f32_t simsimd_approximate_log(simsimd_f32_t number) {
  * https://gist.github.com/milhidaka/95863906fe828198f47991c813dbe233
  * https://github.com/OpenCyphal/libcanard/blob/636795f4bc395f56af8d2c61d3757b5e762bb9e5/canard.c#L811-L834
  */
-SIMSIMD_INTERNAL simsimd_f32_t simsimd_f16_to_f32(simsimd_f16_t const *x_ptr) {
+SIMSIMD_INTERNAL simsimd_f32_t simsimd_f16_to_f32_implementation(simsimd_f16_t const *x_ptr) {
     unsigned short x;
     SIMSIMD_COPY16(&x, x_ptr);
     unsigned int exponent = (x & 0x7C00) >> 10;
@@ -570,7 +570,7 @@ SIMSIMD_INTERNAL simsimd_f32_t simsimd_f16_to_f32(simsimd_f16_t const *x_ptr) {
  * https://gist.github.com/milhidaka/95863906fe828198f47991c813dbe233
  * https://github.com/OpenCyphal/libcanard/blob/636795f4bc395f56af8d2c61d3757b5e762bb9e5/canard.c#L811-L834
  */
-SIMSIMD_INTERNAL void simsimd_f32_to_f16(simsimd_f32_t x, simsimd_f16_t *result_ptr) {
+SIMSIMD_INTERNAL void simsimd_f32_to_f16_implementation(simsimd_f32_t x, simsimd_f16_t *result_ptr) {
     simsimd_f32i32_t conv;
     conv.f = x;
     unsigned int b = conv.i + 0x00001000;
@@ -589,7 +589,7 @@ SIMSIMD_INTERNAL void simsimd_f32_to_f16(simsimd_f32_t x, simsimd_f16_t *result_
  * https://stackoverflow.com/questions/55253233/convert-fp32-to-bfloat16-in-c/55254307#55254307
  * https://cloud.google.com/blog/products/ai-machine-learning/bfloat16-the-secret-to-high-performance-on-cloud-tpus
  */
-SIMSIMD_INTERNAL simsimd_f32_t simsimd_bf16_to_f32(simsimd_bf16_t const *x_ptr) {
+SIMSIMD_INTERNAL simsimd_f32_t simsimd_bf16_to_f32_implementation(simsimd_bf16_t const *x_ptr) {
     unsigned short x;
     SIMSIMD_COPY16(&x, x_ptr);
     simsimd_f32i32_t conv;
@@ -603,7 +603,7 @@ SIMSIMD_INTERNAL simsimd_f32_t simsimd_bf16_to_f32(simsimd_bf16_t const *x_ptr)
  * https://stackoverflow.com/questions/55253233/convert-fp32-to-bfloat16-in-c/55254307#55254307
  * https://cloud.google.com/blog/products/ai-machine-learning/bfloat16-the-secret-to-high-performance-on-cloud-tpus
  */
-SIMSIMD_INTERNAL void simsimd_f32_to_bf16(simsimd_f32_t x, simsimd_bf16_t *result_ptr) {
+SIMSIMD_INTERNAL void simsimd_f32_to_bf16_implementation(simsimd_f32_t x, simsimd_bf16_t *result_ptr) {
     simsimd_f32i32_t conv;
     conv.f = x;
     conv.i += 0x8000; // Rounding is optional
@@ -620,6 +620,44 @@ SIMSIMD_INTERNAL simsimd_u32_t simsimd_u32_ror(simsimd_u32_t x, int n) { return
 SIMSIMD_INTERNAL simsimd_u16_t simsimd_u16_ror(simsimd_u16_t x, int n) { return (x >> n) | (x << (16 - n)); }
 SIMSIMD_INTERNAL simsimd_u8_t simsimd_u8_ror(simsimd_u8_t x, int n) { return (x >> n) | (x << (8 - n)); }
 
+#if SIMSIMD_DYNAMIC_DISPATCH
+
+/** @copydoc simsimd_f16_to_f32_implementation */
+SIMSIMD_DYNAMIC simsimd_f32_t simsimd_f16_to_f32(simsimd_f16_t const *x_ptr);
+
+/** @copydoc simsimd_f32_to_f16_implementation */
+SIMSIMD_DYNAMIC void simsimd_f32_to_f16(simsimd_f32_t x, simsimd_f16_t *result_ptr);
+
+/** @copydoc simsimd_bf16_to_f32_implementation */
+SIMSIMD_DYNAMIC simsimd_f32_t simsimd_bf16_to_f32(simsimd_bf16_t const *x_ptr);
+
+/** @copydoc simsimd_f32_to_bf16_implementation */
+SIMSIMD_DYNAMIC void simsimd_f32_to_bf16(simsimd_f32_t x, simsimd_bf16_t *result_ptr);
+
+#else // SIMSIMD_DYNAMIC_DISPATCH
+
+/** @copydoc simsimd_f16_to_f32_implementation */
+SIMSIMD_PUBLIC simsimd_f32_t simsimd_f16_to_f32(simsimd_f16_t const *x_ptr) {
+    return simsimd_f16_to_f32_implementation(x_ptr);
+}
+
+/** @copydoc simsimd_f32_to_f16_implementation */
+SIMSIMD_PUBLIC void simsimd_f32_to_f16(simsimd_f32_t x, simsimd_f16_t *result_ptr) {
+    simsimd_f32_to_f16_implementation(x, result_ptr);
+}
+
+/** @copydoc simsimd_bf16_to_f32_implementation */
+SIMSIMD_PUBLIC simsimd_f32_t simsimd_bf16_to_f32(simsimd_bf16_t const *x_ptr) {
+    return simsimd_bf16_to_f32_implementation(x_ptr);
+}
+
+/** @copydoc simsimd_f32_to_bf16_implementation */
+SIMSIMD_PUBLIC void simsimd_f32_to_bf16(simsimd_f32_t x, simsimd_bf16_t *result_ptr) {
+    simsimd_f32_to_bf16_implementation(x, result_ptr);
+}
+
+#endif // SIMSIMD_DYNAMIC_DISPATCH
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
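
The renamed `_implementation` bodies stay header-internal; the arithmetic they perform is plain bit manipulation over the `f32` pattern. For `bf16`, truncation keeps the top 16 bits, and the `conv.i += 0x8000` step above rounds to the nearest representable value before the low word is dropped. A self-contained sketch of that arithmetic, for illustration only (the helper names here are made up, not part of the library):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Keep the top 16 bits of the f32 bit pattern, adding 0x8000
// (half of the dropped low word) first to round to nearest.
static uint16_t f32_to_bf16_bits(float x) {
    uint32_t bits;
    memcpy(&bits, &x, sizeof(bits)); // type-pun without undefined behavior
    bits += 0x8000;                  // rounding is optional, as in types.h
    return (uint16_t)(bits >> 16);
}

// Re-pad the dropped low word with zeros to recover an f32.
static float bf16_bits_to_f32(uint16_t h) {
    uint32_t bits = (uint32_t)h << 16;
    float x;
    memcpy(&x, &bits, sizeof(x));
    return x;
}

int main(void) {
    float original = 3.14159f;                            // bit pattern 0x40490FD0
    uint16_t h = f32_to_bf16_bits(original);              // 0x4049
    printf("%f -> %f\n", original, bf16_bits_to_f32(h)); // 3.141590 -> 3.140625
    return 0;
}
```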
