Skip to content

Commit b3d9d58

Browse files
authored
Merge pull request #267 from ashvardanian/main-dev
Supporting R-profile Arm CPUs
2 parents b07cc52 + 4116f8a commit b3d9d58

File tree

10 files changed

+46
-38
lines changed

10 files changed

+46
-38
lines changed

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"bfloat",
77
"bitalg",
88
"BLAS",
9+
"BLIS",
910
"Carmack",
1011
"castsi",
1112
"CBLAS",

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ if (SIMSIMD_BUILD_BENCHMARKS)
7575
FetchContent_Declare(
7676
benchmark
7777
GIT_REPOSITORY https://github.com/google/benchmark.git
78-
GIT_TAG v1.7.0
78+
GIT_TAG v1.9.4
7979
)
8080
FetchContent_MakeAvailable(benchmark)
8181

CONTRIBUTING.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ Replacing the default compiler across the entire system is not recommended on Ma
6868

6969
```sh
7070
brew install llvm openblas
71+
unset DEVELOPER_DIR
7172
cmake -D CMAKE_BUILD_TYPE=Release \
7273
-D SIMSIMD_BUILD_TESTS=1 \
7374
-D SIMSIMD_BUILD_BENCHMARKS=1 \
@@ -86,10 +87,10 @@ cmake --build build_release --config Release
8687
When benchmarking, make sure to disable multi-threading in the BLAS library, as it may interfere with the results:
8788

8889
```sh
89-
export OPENBLAS_NUM_THREADS=1 # for OpenBLAS
90-
export MKL_NUM_THREADS=1 # for Intel MKL
91-
export VECLIB_MAXIMUM_THREADS=1 # for Apple Accelerate
92-
export BLIS_NUM_THREADS=1 # for BLIS
90+
export OPENBLAS_NUM_THREADS=1 # for OpenBLAS
91+
export MKL_NUM_THREADS=1 # for Intel MKL
92+
export VECLIB_MAXIMUM_THREADS=1 # for Apple Accelerate
93+
export BLIS_NUM_THREADS=1 # for BLIS
9394
```
9495

9596
## Python

README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -829,12 +829,13 @@ int main() {
829829

830830
simsimd_f32_t vector_a[1536];
831831
simsimd_f32_t vector_b[1536];
832-
simsimd_kernel_punned_t distance_function = simsimd_metric_punned(
832+
simsimd_kernel_punned_t metric_punned = simsimd_metric_punned(
833833
simsimd_metric_cos_k, // Metric kind, like the angular cosine distance
834834
simsimd_datatype_f32_k, // Data type, like: f16, f32, f64, i8, b8, and complex variants
835835
simsimd_cap_any_k); // Which CPU capabilities are we allowed to use
836836
simsimd_distance_t distance;
837-
distance_function(vector_a, vector_b, 1536, &distance);
837+
simsimd_metric_dense_punned_t metric = (simsimd_metric_dense_punned_t)metric_punned;
838+
metric(vector_a, vector_b, 1536, &distance);
838839
return 0;
839840
}
840841
```

include/simsimd/binary.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,8 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_serial(simsimd_b8_t const *a, simsimd_b8_
107107
#if _SIMSIMD_TARGET_ARM
108108
#if SIMSIMD_TARGET_NEON
109109
#pragma GCC push_options
110-
#pragma GCC target("arch=armv8.2-a+simd")
111-
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function)
110+
#pragma GCC target("arch=armv8-a+simd")
111+
#pragma clang attribute push(__attribute__((target("arch=armv8-a+simd"))), apply_to = function)
112112

113113
SIMSIMD_INTERNAL simsimd_u32_t _simsimd_reduce_u8x16_neon(uint8x16_t vec) {
114114
// Split the vector into two halves and widen to `uint16x8_t`

include/simsimd/dot.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -244,8 +244,8 @@ SIMSIMD_MAKE_COMPLEX_VDOT(accurate, bf16c, f64, SIMSIMD_BF16_TO_F32) // simsimd_
244244
#if _SIMSIMD_TARGET_ARM
245245
#if SIMSIMD_TARGET_NEON
246246
#pragma GCC push_options
247-
#pragma GCC target("arch=armv8.2-a+simd")
248-
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function)
247+
#pragma GCC target("arch=armv8-a+simd")
248+
#pragma clang attribute push(__attribute__((target("arch=armv8-a+simd"))), apply_to = function)
249249

250250
SIMSIMD_INTERNAL float32x4_t _simsimd_partial_load_f32x4_neon(simsimd_f32_t const *x, simsimd_size_t n) {
251251
union {

include/simsimd/simsimd.h

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -529,7 +529,11 @@ SIMSIMD_PUBLIC simsimd_capability_t _simsimd_capabilities_arm(void) {
529529
// AdvSIMD, bits [23:20] of ID_AA64PFR0_EL1 can be used to check for `fp16` support
530530
// - 0b0000: integers, single, double precision arithmetic
531531
// - 0b0001: includes support for half-precision floating-point arithmetic
532-
unsigned supports_fp16 = ((id_aa64pfr0_el1 >> 20) & 0xF) == 1;
532+
// - 0b1111: Advanced SIMD (NEON) is not implemented
533+
// That's a really weird way to encode lack of NEON support, but it's important to
534+
// check in case we are running on R-profile CPUs.
535+
unsigned supports_fp16 = ((id_aa64pfr0_el1 >> 20) & 0xF) == 0x1;
536+
unsigned supports_neon = ((id_aa64pfr0_el1 >> 20) & 0xF) != 0xF;
533537

534538
// Now let's unpack the status flags from ID_AA64ZFR0_EL1
535539
// https://developer.arm.com/documentation/ddi0601/2024-03/AArch64-Registers/ID-AA64ZFR0-EL1--SVE-Feature-ID-Register-0?lang=en
@@ -545,7 +549,6 @@ SIMSIMD_PUBLIC simsimd_capability_t _simsimd_capabilities_arm(void) {
545549
// This value must match the existing indicator obtained from ID_AA64PFR0_EL1:
546550
unsigned supports_sve2 = ((id_aa64zfr0_el1) & 0xF) >= 1;
547551
unsigned supports_sve2p1 = ((id_aa64zfr0_el1) & 0xF) >= 2;
548-
unsigned supports_neon = 1; // NEON is always supported
549552

550553
return (simsimd_capability_t)( //
551554
(simsimd_cap_neon_k * (supports_neon)) | //
@@ -1575,7 +1578,7 @@ SIMSIMD_PUBLIC void simsimd_find_kernel_punned( //
15751578
*/
15761579
SIMSIMD_PUBLIC void simsimd_dot_i8(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
15771580
simsimd_distance_t *d) {
1578-
#if SIMSIMD_TARGET_NEON_F16
1581+
#if SIMSIMD_TARGET_NEON_I8
15791582
simsimd_dot_i8_neon(a, b, n, d);
15801583
#elif SIMSIMD_TARGET_ICE
15811584
simsimd_dot_i8_ice(a, b, n, d);
@@ -1699,7 +1702,7 @@ SIMSIMD_PUBLIC void simsimd_vdot_f16c(simsimd_f16c_t const *a, simsimd_f16c_t co
16991702
simsimd_distance_t *d) {
17001703
#if SIMSIMD_TARGET_SVE
17011704
simsimd_vdot_f16c_sve(a, b, n, d);
1702-
#elif SIMSIMD_TARGET_NEON
1705+
#elif SIMSIMD_TARGET_NEON_F16
17031706
simsimd_dot_f16c_neon(a, b, n, d);
17041707
#elif SIMSIMD_TARGET_SAPPHIRE
17051708
simsimd_dot_f16c_sapphire(a, b, n, d);
@@ -1759,7 +1762,7 @@ SIMSIMD_PUBLIC void simsimd_vdot_f64c(simsimd_f64c_t const *a, simsimd_f64c_t co
17591762
*/
17601763
SIMSIMD_PUBLIC void simsimd_cos_i8(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
17611764
simsimd_distance_t *d) {
1762-
#if SIMSIMD_TARGET_NEON
1765+
#if SIMSIMD_TARGET_NEON_I8
17631766
simsimd_cos_i8_neon(a, b, n, d);
17641767
#elif SIMSIMD_TARGET_ICE
17651768
simsimd_cos_i8_ice(a, b, n, d);
@@ -1771,7 +1774,7 @@ SIMSIMD_PUBLIC void simsimd_cos_i8(simsimd_i8_t const *a, simsimd_i8_t const *b,
17711774
}
17721775
SIMSIMD_PUBLIC void simsimd_cos_u8(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
17731776
simsimd_distance_t *d) {
1774-
#if SIMSIMD_TARGET_NEON
1777+
#if SIMSIMD_TARGET_NEON_I8
17751778
simsimd_cos_u8_neon(a, b, n, d);
17761779
#elif SIMSIMD_TARGET_ICE
17771780
simsimd_cos_u8_ice(a, b, n, d);
@@ -1837,7 +1840,7 @@ SIMSIMD_PUBLIC void simsimd_cos_f64(simsimd_f64_t const *a, simsimd_f64_t const
18371840
}
18381841
SIMSIMD_PUBLIC void simsimd_l2sq_i8(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
18391842
simsimd_distance_t *d) {
1840-
#if SIMSIMD_TARGET_NEON
1843+
#if SIMSIMD_TARGET_NEON_I8
18411844
simsimd_l2sq_i8_neon(a, b, n, d);
18421845
#elif SIMSIMD_TARGET_ICE
18431846
simsimd_l2sq_i8_ice(a, b, n, d);
@@ -1849,7 +1852,7 @@ SIMSIMD_PUBLIC void simsimd_l2sq_i8(simsimd_i8_t const *a, simsimd_i8_t const *b
18491852
}
18501853
SIMSIMD_PUBLIC void simsimd_l2sq_u8(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
18511854
simsimd_distance_t *d) {
1852-
#if SIMSIMD_TARGET_NEON
1855+
#if SIMSIMD_TARGET_NEON_I8
18531856
simsimd_l2sq_u8_neon(a, b, n, d);
18541857
#elif SIMSIMD_TARGET_ICE
18551858
simsimd_l2sq_u8_ice(a, b, n, d);
@@ -1915,7 +1918,7 @@ SIMSIMD_PUBLIC void simsimd_l2sq_f64(simsimd_f64_t const *a, simsimd_f64_t const
19151918
}
19161919
SIMSIMD_PUBLIC void simsimd_l2_i8(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
19171920
simsimd_distance_t *d) {
1918-
#if SIMSIMD_TARGET_NEON
1921+
#if SIMSIMD_TARGET_NEON_I8
19191922
simsimd_l2_i8_neon(a, b, n, d);
19201923
#elif SIMSIMD_TARGET_ICE
19211924
simsimd_l2_i8_ice(a, b, n, d);
@@ -1927,7 +1930,7 @@ SIMSIMD_PUBLIC void simsimd_l2_i8(simsimd_i8_t const *a, simsimd_i8_t const *b,
19271930
}
19281931
SIMSIMD_PUBLIC void simsimd_l2_u8(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
19291932
simsimd_distance_t *d) {
1930-
#if SIMSIMD_TARGET_NEON
1933+
#if SIMSIMD_TARGET_NEON_I8
19311934
simsimd_l2_u8_neon(a, b, n, d);
19321935
#elif SIMSIMD_TARGET_ICE
19331936
simsimd_l2_u8_ice(a, b, n, d);
@@ -2050,7 +2053,7 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8(simsimd_b8_t const *a, simsimd_b8_t const
20502053
*/
20512054
SIMSIMD_PUBLIC void simsimd_kl_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
20522055
simsimd_distance_t *d) {
2053-
#if SIMSIMD_TARGET_NEON
2056+
#if SIMSIMD_TARGET_NEON_F16
20542057
simsimd_kl_f16_neon(a, b, n, d);
20552058
#elif SIMSIMD_TARGET_HASWELL
20562059
simsimd_kl_f16_haswell(a, b, n, d);
@@ -2078,7 +2081,7 @@ SIMSIMD_PUBLIC void simsimd_kl_f64(simsimd_f64_t const *a, simsimd_f64_t const *
20782081
}
20792082
SIMSIMD_PUBLIC void simsimd_js_f16(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
20802083
simsimd_distance_t *d) {
2081-
#if SIMSIMD_TARGET_NEON
2084+
#if SIMSIMD_TARGET_NEON_F16
20822085
simsimd_js_f16_neon(a, b, n, d);
20832086
#elif SIMSIMD_TARGET_HASWELL
20842087
simsimd_js_f16_haswell(a, b, n, d);
@@ -2209,7 +2212,7 @@ SIMSIMD_PUBLIC void simsimd_bilinear_f16(simsimd_f16_t const *a, simsimd_f16_t c
22092212
simsimd_bilinear_f16_sapphire(a, b, c, n, d);
22102213
#elif SIMSIMD_TARGET_HASWELL
22112214
simsimd_bilinear_f16_haswell(a, b, c, n, d);
2212-
#elif SIMSIMD_TARGET_NEON
2215+
#elif SIMSIMD_TARGET_NEON_F16
22132216
simsimd_bilinear_f16_neon(a, b, c, n, d);
22142217
#else
22152218
simsimd_bilinear_f16_serial(a, b, c, n, d);
@@ -2221,7 +2224,7 @@ SIMSIMD_PUBLIC void simsimd_bilinear_bf16(simsimd_bf16_t const *a, simsimd_bf16_
22212224
simsimd_bilinear_bf16_genoa(a, b, c, n, d);
22222225
#elif SIMSIMD_TARGET_HASWELL
22232226
simsimd_bilinear_bf16_haswell(a, b, c, n, d);
2224-
#elif SIMSIMD_TARGET_NEON
2227+
#elif SIMSIMD_TARGET_NEON_BF16
22252228
simsimd_bilinear_bf16_neon(a, b, c, n, d);
22262229
#else
22272230
simsimd_bilinear_bf16_serial(a, b, c, n, d);
@@ -2249,7 +2252,7 @@ SIMSIMD_PUBLIC void simsimd_bilinear_f16c(simsimd_f16c_t const *a, simsimd_f16c_
22492252
simsimd_size_t n, simsimd_distance_t *d) {
22502253
#if SIMSIMD_TARGET_SAPPHIRE
22512254
simsimd_bilinear_f16c_sapphire(a, b, c, n, d);
2252-
#elif SIMSIMD_TARGET_NEON
2255+
#elif SIMSIMD_TARGET_NEON_F16
22532256
simsimd_bilinear_f16c_neon(a, b, c, n, d);
22542257
#else
22552258
simsimd_bilinear_f16c_serial(a, b, c, n, d);
@@ -2259,7 +2262,7 @@ SIMSIMD_PUBLIC void simsimd_bilinear_bf16c(simsimd_bf16c_t const *a, simsimd_bf1
22592262
simsimd_size_t n, simsimd_distance_t *d) {
22602263
#if SIMSIMD_TARGET_GENOA
22612264
simsimd_bilinear_bf16c_genoa(a, b, c, n, d);
2262-
#elif SIMSIMD_TARGET_NEON
2265+
#elif SIMSIMD_TARGET_NEON_BF16
22632266
simsimd_bilinear_bf16c_neon(a, b, c, n, d);
22642267
#else
22652268
simsimd_bilinear_bf16c_serial(a, b, c, n, d);
@@ -2289,7 +2292,7 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_f16(simsimd_f16_t const *a, simsimd_f16_
22892292
simsimd_mahalanobis_f16_sapphire(a, b, c, n, d);
22902293
#elif SIMSIMD_TARGET_HASWELL
22912294
simsimd_mahalanobis_f16_haswell(a, b, c, n, d);
2292-
#elif SIMSIMD_TARGET_NEON
2295+
#elif SIMSIMD_TARGET_NEON_F16
22932296
simsimd_mahalanobis_f16_neon(a, b, c, n, d);
22942297
#else
22952298
simsimd_mahalanobis_f16_serial(a, b, c, n, d);
@@ -2301,7 +2304,7 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16(simsimd_bf16_t const *a, simsimd_bf
23012304
simsimd_mahalanobis_bf16_genoa(a, b, c, n, d);
23022305
#elif SIMSIMD_TARGET_HASWELL
23032306
simsimd_mahalanobis_bf16_haswell(a, b, c, n, d);
2304-
#elif SIMSIMD_TARGET_NEON
2307+
#elif SIMSIMD_TARGET_NEON_BF16
23052308
simsimd_mahalanobis_bf16_neon(a, b, c, n, d);
23062309
#else
23072310
simsimd_mahalanobis_bf16_serial(a, b, c, n, d);
@@ -2348,7 +2351,7 @@ SIMSIMD_PUBLIC void simsimd_wsum_bf16(simsimd_bf16_t const *a, simsimd_bf16_t co
23482351
simsimd_wsum_bf16_skylake(a, b, n, alpha, beta, r);
23492352
#elif SIMSIMD_TARGET_HASWELL
23502353
simsimd_wsum_bf16_haswell(a, b, n, alpha, beta, r);
2351-
#elif SIMSIMD_TARGET_NEON
2354+
#elif SIMSIMD_TARGET_NEON_BF16
23522355
simsimd_wsum_bf16_neon(a, b, n, alpha, beta, r);
23532356
#else
23542357
simsimd_wsum_bf16_serial(a, b, n, alpha, beta, r);
@@ -2427,7 +2430,7 @@ SIMSIMD_PUBLIC void simsimd_fma_bf16(simsimd_bf16_t const *a, simsimd_bf16_t con
24272430
simsimd_fma_bf16_skylake(a, b, c, n, alpha, beta, r);
24282431
#elif SIMSIMD_TARGET_HASWELL
24292432
simsimd_fma_bf16_haswell(a, b, c, n, alpha, beta, r);
2430-
#elif SIMSIMD_TARGET_NEON
2433+
#elif SIMSIMD_TARGET_NEON_BF16
24312434
simsimd_fma_bf16_neon(a, b, c, n, alpha, beta, r);
24322435
#else
24332436
simsimd_fma_bf16_serial(a, b, c, n, alpha, beta, r);

include/simsimd/sparse.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -850,8 +850,8 @@ SIMSIMD_PUBLIC void simsimd_spdot_counts_u16_turin( //
850850
#if _SIMSIMD_TARGET_ARM
851851
#if SIMSIMD_TARGET_NEON
852852
#pragma GCC push_options
853-
#pragma GCC target("arch=armv8.2-a")
854-
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a"))), apply_to = function)
853+
#pragma GCC target("arch=armv8-a")
854+
#pragma clang attribute push(__attribute__((target("arch=armv8-a"))), apply_to = function)
855855

856856
/**
857857
* @brief Uses `vshrn` to produce a bitmask, similar to `movemask` in SSE.

include/simsimd/spatial.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -263,8 +263,8 @@ SIMSIMD_MAKE_L2(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_l2_bf16_a
263263
#if _SIMSIMD_TARGET_ARM
264264
#if SIMSIMD_TARGET_NEON
265265
#pragma GCC push_options
266-
#pragma GCC target("arch=armv8.2-a+simd")
267-
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function)
266+
#pragma GCC target("arch=armv8-a+simd")
267+
#pragma clang attribute push(__attribute__((target("arch=armv8-a+simd"))), apply_to = function)
268268

269269
SIMSIMD_INTERNAL simsimd_f32_t _simsimd_sqrt_f32_neon(simsimd_f32_t x) {
270270
return vget_lane_f32(vsqrt_f32(vdup_n_f32(x)), 0);

include/simsimd/types.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,16 @@
2727
// - `SIMSIMD_INTERNAL` is used for internal helper functions with unstable APIs.
2828
// - `SIMSIMD_DYNAMIC` is used for functions that are part of the public API, but are dispatched at runtime.
2929
//
30+
// On GCC and Clang we mark the functions as `nonnull`, informing the compiler that none of the pointer arguments can be `NULL`.
31+
// Marking with `pure` and `const` isn't possible, as outputting to a pointer is a "side effect".
3032
#if defined(_WIN32) || defined(__CYGWIN__)
3133
#define SIMSIMD_DYNAMIC __declspec(dllexport)
3234
#define SIMSIMD_PUBLIC inline static
3335
#define SIMSIMD_INTERNAL inline static
3436
#elif defined(__GNUC__) || defined(__clang__)
35-
#define SIMSIMD_DYNAMIC __attribute__((visibility("default")))
36-
#define SIMSIMD_PUBLIC __attribute__((unused)) inline static
37-
#define SIMSIMD_INTERNAL inline static
37+
#define SIMSIMD_DYNAMIC __attribute__((visibility("default"))) __attribute__((nonnull))
38+
#define SIMSIMD_PUBLIC __attribute__((unused, nonnull)) inline static
39+
#define SIMSIMD_INTERNAL __attribute__((always_inline)) inline static
3840
#else
3941
#define SIMSIMD_DYNAMIC
4042
#define SIMSIMD_PUBLIC inline static

0 commit comments

Comments
 (0)