-
Notifications
You must be signed in to change notification settings - Fork 782
Hyperloglog ARM NEON SIMD optimization #1859
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: unstable
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,6 +47,10 @@ | |
#include <immintrin.h> | ||
#endif | ||
|
||
#ifdef __ARM_NEON | ||
#include <arm_neon.h> | ||
#endif | ||
|
||
/* The HyperLogLog implementation is based on the following ideas: | ||
* | ||
* * The use of a 64 bit hash function as proposed in [1], in order to estimate | ||
|
@@ -227,6 +231,13 @@ static int simd_enabled = 1; | |
#define HLL_USE_AVX2 0 | ||
#endif | ||
|
||
#ifdef __ARM_NEON | ||
static int simd_enabled = 1; | ||
#define HLL_USE_NEON (simd_enabled) | ||
#else | ||
#define HLL_USE_NEON 0 | ||
#endif | ||
|
||
/* =========================== Low level bit macros ========================= */ | ||
|
||
/* Macros to access the dense representation. | ||
|
@@ -1193,6 +1204,95 @@ void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) { | |
} | ||
#endif | ||
|
||
#if defined(__ARM_NEON) | ||
/* | ||
* hllMergeDenseNEON is an ARM optimized version of hllMergeDense using NEON | ||
* | ||
* This function merges HyperLogLog (HLL) dense registers using ARM NEON SIMD instructions. | ||
* It extracts 6 bits registers from a dense format, and stores them in raw format | ||
* | ||
* Parameters: | ||
* - reg_raw: Pointer to the raw register array | ||
* - reg_dense: Pointer to the dense register array | ||
*/ | ||
void hllMergeDenseNEON(uint8_t *reg_raw, const uint8_t *reg_dense) { | ||
uint8_t *dense_ptr = (uint8_t *)reg_dense; | ||
uint8_t *raw_ptr = (uint8_t *)reg_raw; | ||
|
||
uint8x16_t idx = {0, 1, 2, 0xFF, | ||
3, 4, 5, 0xFF, | ||
6, 7, 8, 0xFF, | ||
9, 10, 11, 0xFF}; | ||
|
||
// Bit masks for extracting specific bit ranges | ||
uint8x16_t mask1 = vreinterpretq_u8_u32(vdupq_n_u32(0x0000003f)); // Bits 0-5 | ||
uint8x16_t mask2 = vreinterpretq_u8_u32(vdupq_n_u32(0x00000fc0)); // Bits 6-11 | ||
uint8x16_t mask3 = vreinterpretq_u8_u32(vdupq_n_u32(0x0003f000)); // Bits 12-17 | ||
uint8x16_t mask4 = vreinterpretq_u8_u32(vdupq_n_u32(0x00fc0000)); // Bits 18-23 | ||
|
||
for (int i = 0; i < HLL_REGISTERS / 16 - 1; ++i) { | ||
/* Load 16 bytes from dense registers but only the first 12 bytes are processed because they contain | ||
* 16 registers, which is copied into 16 bytes raw registers. | ||
* The last 4 bytes are ignored because (1) they do not form a complete number of registers, and do not fit | ||
* in the 16 bytes. The unprocessed 4 bytes are processed in the next iteration. | ||
*/ | ||
uint8x16_t r = vld1q_u8(dense_ptr); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. May crash on ARMv7-A if the address isn't 16-byte aligned. TODO |
||
|
||
/* Reorder bytes based on index mapping | ||
* Lookup indices | ||
*From: {AAAB|BBCC|CDDD} | ||
*To: {AAA0|BBB0|CCC0|DDD0} | ||
*/ | ||
uint8x16_t x = vqtbl1q_u8(r, idx); | ||
|
||
// Extract and isolate registers | ||
uint8x16_t a1 = vandq_u8(x, mask1); | ||
uint8x16_t a2 = vandq_u8(x, mask2); | ||
uint8x16_t a3 = vandq_u8(x, mask3); | ||
uint8x16_t a4 = vandq_u8(x, mask4); | ||
|
||
// Align extracted values by shifting left | ||
uint32x4_t a2_32 = vreinterpretq_u32_u8(a2); | ||
a2_32 = vshlq_n_u32(a2_32, 2); | ||
a2 = vreinterpretq_u8_u32(a2_32); | ||
|
||
uint32x4_t a3_32 = vreinterpretq_u32_u8(a3); | ||
a3_32 = vshlq_n_u32(a3_32, 4); | ||
a3 = vreinterpretq_u8_u32(a3_32); | ||
|
||
uint32x4_t a4_32 = vreinterpretq_u32_u8(a4); | ||
a4_32 = vshlq_n_u32(a4_32, 6); | ||
a4 = vreinterpretq_u8_u32(a4_32); | ||
|
||
// Combine extracted values | ||
uint8x16_t y1 = vorrq_u8(a1, a2); | ||
uint8x16_t y2 = vorrq_u8(a3, a4); | ||
uint8x16_t y = vorrq_u8(y1, y2); | ||
|
||
// Load current raw register values | ||
uint8x16_t z = vld1q_u8(raw_ptr); | ||
|
||
// Update raw registers with max values | ||
z = vmaxq_u8(z, y); | ||
|
||
// Store updated values | ||
vst1q_u8(raw_ptr, z); | ||
|
||
raw_ptr += 16; | ||
dense_ptr += 12; | ||
} | ||
|
||
/* Process remaining registers, we do this manually because we don't want to over-read 4 bytes */ | ||
uint8_t val; | ||
for (int i = HLL_REGISTERS - 16; i < HLL_REGISTERS; i++) { | ||
HLL_DENSE_GET_REGISTER(val, reg_dense, i); | ||
if (val > reg_raw[i]) { | ||
reg_raw[i] = val; // Update raw register if new value is greater | ||
} | ||
} | ||
} | ||
#endif // __ARM_NEON__ | ||
|
||
/* Merge dense-encoded registers to raw registers array. */ | ||
void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) { | ||
#ifdef HAVE_AVX2 | ||
|
@@ -1203,6 +1303,14 @@ void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) { | |
} | ||
} | ||
#endif | ||
#ifdef __ARM_NEON | ||
if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { | ||
if (HLL_USE_NEON) { | ||
hllMergeDenseNEON(reg_raw, reg_dense); | ||
return; | ||
} | ||
} | ||
#endif | ||
|
||
uint8_t val; | ||
for (int i = 0; i < HLL_REGISTERS; i++) { | ||
|
@@ -1357,6 +1465,74 @@ void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) { | |
} | ||
#endif | ||
|
||
#if defined(__ARM_NEON) | ||
/* | ||
* hllDenseCompressNEON is ARM optimized version of hllDenseCompress using NEON. | ||
* | ||
* This function takes a raw register (`reg_raw`) and compresses it into a dense representation (`reg_dense`). | ||
* It uses NEON SIMD instructions to process multiple values at once. | ||
* | ||
* - The first loop processes most of the registers in 16-element blocks using NEON instructions. | ||
* - The second loop handles the remaining registers using a direct assignment macro. | ||
* | ||
*/ | ||
void hllDenseCompressNEON(uint8_t *reg_dense, const uint8_t *reg_raw) { | ||
/* Shuffle indices for packing bytes of dense registers | ||
* From: {AAA0|BBB0|CCC0|DDD0} | ||
* To: {AAAB|BBCC|CDDD|0000} | ||
*/ | ||
uint8x16_t idx = { | ||
0, 1, 2, // Extract bytes from lane 0 | ||
4, 5, 6, // Extract bytes from lane 1 | ||
8, 9, 10, // Extract bytes from lane 2 | ||
12, 13, 14, // Extract bytes from lane 3 | ||
0xFF, 0xFF, 0xFF, 0xFF // Zero out last 4 elements (padding) | ||
}; | ||
|
||
// Bit masks for extracting first 6 bits from every byte within 32-bit lanes | ||
uint32x4_t mask1 = vdupq_n_u32(0x0000003F); // Extract bits 0-5 | ||
uint32x4_t mask2 = vdupq_n_u32(0x00003F00); // Extract bits 8-13 | ||
uint32x4_t mask3 = vdupq_n_u32(0x003F0000); // Extract bits 16-21 | ||
uint32x4_t mask4 = vdupq_n_u32(0x3F000000); // Extract bits 24-29 | ||
|
||
uint8_t *r = (uint8_t *)reg_raw; // Input pointer | ||
uint8_t *t = (uint8_t *)reg_dense; // Output pointer | ||
|
||
// Process registers in blocks of 16 using NEON instructions | ||
// The last 16 registers are processed separately to avoid overwriting, as the final write is 12 bytes. | ||
for (int i = 0; i < HLL_REGISTERS / 16 - 1; i++) { | ||
// Load 16 bytes as 4x 32-bit values | ||
uint32x4_t x = vld1q_u32((uint32_t *)r); | ||
|
||
// Apply masks to extract a single register from every 4 registers, for every lane | ||
uint32x4_t a1 = vandq_u32(x, mask1); | ||
uint32x4_t a2 = vandq_u32(x, mask2); | ||
uint32x4_t a3 = vandq_u32(x, mask3); | ||
uint32x4_t a4 = vandq_u32(x, mask4); | ||
|
||
// Shift extracted bits to align them properly | ||
a2 = vshrq_n_u32(a2, 2); | ||
a3 = vshrq_n_u32(a3, 4); | ||
a4 = vshrq_n_u32(a4, 6); | ||
|
||
uint32x4_t y1 = vorrq_u32(a1, a2); | ||
uint32x4_t y2 = vorrq_u32(a3, a4); | ||
uint32x4_t y = vorrq_u32(y1, y2); | ||
|
||
// Perform a table lookup to shuffle extracted values and align them in 12 bytes | ||
vst1q_u8(t, vqtbl1q_u8(vreinterpretq_u8_u32(y), idx)); | ||
|
||
t += 12; | ||
r += 16; | ||
} | ||
|
||
// Handle the remaining registers individually (12 bytes) | ||
for (int i = HLL_REGISTERS - 16; i < HLL_REGISTERS; i++) { | ||
HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]); | ||
} | ||
} | ||
#endif // __ARM_NEON__ | ||
|
||
/* Compress raw registers to dense representation. */ | ||
void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) { | ||
#ifdef HAVE_AVX2 | ||
|
@@ -1366,6 +1542,16 @@ void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) { | |
return; | ||
} | ||
} | ||
|
||
#endif | ||
|
||
#ifdef __ARM_NEON | ||
if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { | ||
if (HLL_USE_NEON) { | ||
hllDenseCompressNEON(reg_dense, reg_raw); | ||
return; | ||
} | ||
} | ||
#endif | ||
|
||
for (int i = 0; i < HLL_REGISTERS; i++) { | ||
|
@@ -1772,16 +1958,22 @@ void pfdebugCommand(client *c) { | |
if (!strcasecmp(c->argv[2]->ptr, "on")) { | ||
#ifdef HAVE_AVX2 | ||
simd_enabled = 1; | ||
#endif | ||
#ifdef __ARM_NEON | ||
simd_enabled = 1; | ||
#endif | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
or we can consider some macro like |
||
} else if (!strcasecmp(c->argv[2]->ptr, "off")) { | ||
#ifdef HAVE_AVX2 | ||
simd_enabled = 0; | ||
#endif | ||
#ifdef __ARM_NEON | ||
simd_enabled = 0; | ||
#endif | ||
} else { | ||
addReplyError(c, "Argument must be ON or OFF"); | ||
} | ||
|
||
if (HLL_USE_AVX2) { | ||
if (HLL_USE_AVX2 || HLL_USE_NEON) { | ||
addReplyStatus(c, "enabled"); | ||
} else { | ||
addReplyStatus(c, "disabled"); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need add a runtime check here when server startup ?