From 406410dcb11468694147eb6190221bcfdc8be315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 5 Apr 2025 13:35:18 +0200 Subject: [PATCH 1/2] add support for SIMD-accelerated HMAC-BLAKE2 --- Modules/hmacmodule.c | 73 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/Modules/hmacmodule.c b/Modules/hmacmodule.c index faf4e0a023147e..0b718081110ae8 100644 --- a/Modules/hmacmodule.c +++ b/Modules/hmacmodule.c @@ -20,6 +20,21 @@ #include "pycore_hashtable.h" #include "pycore_strhex.h" // _Py_strhex() +/* + * Taken from blake2module.c. In the future, detection of SIMD support + * should be delegated to https://github.com/python/cpython/pull/125011. + */ +#if defined(__x86_64__) && defined(__GNUC__) +# include +#elif defined(_M_X64) +# include +#endif + +#if defined(__APPLE__) && defined(__arm64__) +# undef HACL_CAN_COMPILE_SIMD128 +# undef HACL_CAN_COMPILE_SIMD256 +#endif + // Small mismatch between the variable names Python defines as part of configure // at the ones HACL* expects to be set in order to enable those headers. #define HACL_CAN_COMPILE_VEC128 HACL_CAN_COMPILE_SIMD128 @@ -1667,16 +1682,66 @@ hmacmodule_init_strings(hmacmodule_state *state) static void hmacmodule_init_cpu_features(hmacmodule_state *state) { + int eax1 = 0, ebx1 = 0, ecx1 = 0, edx1 = 0; + int eax7 = 0, ebx7 = 0, ecx7 = 0, edx7 = 0; +#if defined(__x86_64__) && defined(__GNUC__) + __cpuid_count(1, 0, eax1, ebx1, ecx1, edx1); + __cpuid_count(7, 0, eax7, ebx7, ecx7, edx7); +#elif defined(_M_X64) + int info1[4] = { 0 }; + __cpuidex(info1, 1, 0); + eax1 = info1[0], ebx1 = info1[1], ecx1 = info1[2], edx1 = info1[3]; + + int info7[4] = { 0 }; + __cpuidex(info7, 7, 0); + eax7 = info7[0], ebx7 = info7[1], ecx7 = info7[2], edx7 = info7[3]; +#endif + // fmt: off + (void)eax1; (void)ebx1; (void)ecx1; (void)edx1; + (void)eax7; (void)ebx7; (void)ecx7; (void)edx7; + // fmt: on + +#define EBX_AVX2 (1 << 5) +#define ECX_SSE3 (1 << 0) +#define ECX_SSSE3 (1 << 9) +#define ECX_SSE4_1 (1 << 19) +#define ECX_SSE4_2 (1 << 20) +#define ECX_AVX (1 << 28) +#define EDX_SSE (1 << 25) +#define EDX_SSE2 (1 << 26) +#define EDX_CMOV (1 << 15) + + bool avx = (ecx1 & ECX_AVX) != 0; + bool avx2 = (ebx7 & EBX_AVX2) != 0; + + bool sse = (edx1 & EDX_SSE) != 0; + bool sse2 = (edx1 & EDX_SSE2) != 0; + bool cmov = (edx1 & EDX_CMOV) != 0; + + bool sse3 = (ecx1 & ECX_SSE3) != 0; + bool sse41 = (ecx1 & ECX_SSE4_1) != 0; + bool sse42 = (ecx1 & ECX_SSE4_2) != 0; + +#undef EDX_CMOV +#undef EDX_SSE2 +#undef EDX_SSE +#undef ECX_AVX +#undef ECX_SSE4_2 +#undef ECX_SSE4_1 +#undef ECX_SSSE3 +#undef ECX_SSE3 +#undef EBX_AVX2 + #if HACL_CAN_COMPILE_SIMD128 - // TODO: use py_cpuid_features (gh-125022) to deduce what we want - state->can_run_simd128 = false; + // TODO(picnixz): use py_cpuid_features (gh-125022) to improve detection + state->can_run_simd128 = sse && sse2 && sse3 && sse41 && sse42 && cmov; #else state->can_run_simd128 = false; #endif #if HACL_CAN_COMPILE_SIMD256 - // TODO: use py_cpuid_features (gh-125022) to deduce what we want - state->can_run_simd256 = false; + // TODO(picnixz): use py_cpuid_features (gh-125022) to improve detection + state->can_run_simd256 = state->can_run_simd128 && avx && avx2; #else state->can_run_simd256 = false; #endif From 2e05bf7234f5835e622cfe48148fdac9da8a70be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 5 Apr 2025 13:43:53 +0200 Subject: [PATCH 2/2] suppress unused warnings --- Modules/hmacmodule.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Modules/hmacmodule.c b/Modules/hmacmodule.c index 0b718081110ae8..f75854c6ef5c91 100644 --- a/Modules/hmacmodule.c +++ b/Modules/hmacmodule.c @@ -1736,6 +1736,9 @@ hmacmodule_init_cpu_features(hmacmodule_state *state) // TODO(picnixz): use py_cpuid_features (gh-125022) to improve detection state->can_run_simd128 = sse && sse2 && sse3 && sse41 && sse42 && cmov; #else + // fmt: off + (void)sse; (void)sse2; (void)sse3; (void)sse41; (void)sse42; (void)cmov; + // fmt: on state->can_run_simd128 = false; #endif @@ -1743,6 +1746,9 @@ hmacmodule_init_cpu_features(hmacmodule_state *state) // TODO(picnixz): use py_cpuid_features (gh-125022) to improve detection state->can_run_simd256 = state->can_run_simd128 && avx && avx2; #else + // fmt: off + (void)avx; (void)avx2; + // fmt: on state->can_run_simd256 = false; #endif }