Skip to content

Commit ea175e2

Browse files
committed
Implement support for Intel crc32 instruction (SSE 4.2)
This change authored by vadimskipin and submitted via: google/leveldb#309 Changes made to support iOS builds and other architectures without support for SSE 4.2. db_bench reports original crc32 speed at: crc32c : 3.610 micros/op; 1082.0 MB/s (4K per op) with this change performance has increased to: crc32c : 0.843 micros/op; 4633.6 MB/s (4K per op) ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=148694935
1 parent 95cd743 commit ea175e2

File tree

6 files changed

+186
-1
lines changed

6 files changed

+186
-1
lines changed

Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,3 +412,9 @@ $(SHARED_OUTDIR)/%.o: %.cc
412412

413413
$(SHARED_OUTDIR)/%.o: %.c
414414
$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@
415+
416+
$(STATIC_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
417+
$(CXX) $(CXXFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@
418+
419+
$(SHARED_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
420+
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@

build_detect_platform

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ PLATFORM_SHARED_EXT="so"
6363
PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
6464
PLATFORM_SHARED_CFLAGS="-fPIC"
6565
PLATFORM_SHARED_VERSIONED=true
66+
PLATFORM_SSEFLAGS=
6667

6768
MEMCMP_FLAG=
6869
if [ "$CXX" = "g++" ]; then
@@ -77,6 +78,7 @@ case "$TARGET_OS" in
7778
COMMON_FLAGS="$MEMCMP_FLAG -lpthread -DOS_LINUX -DCYGWIN"
7879
PLATFORM_LDFLAGS="-lpthread"
7980
PORT_FILE=port/port_posix.cc
81+
PORT_SSE_FILE=port/port_posix_sse.cc
8082
;;
8183
Darwin)
8284
PLATFORM=OS_MACOSX
@@ -85,55 +87,64 @@ case "$TARGET_OS" in
8587
[ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
8688
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name $INSTALL_PATH/"
8789
PORT_FILE=port/port_posix.cc
90+
PORT_SSE_FILE=port/port_posix_sse.cc
8891
;;
8992
Linux)
9093
PLATFORM=OS_LINUX
9194
COMMON_FLAGS="$MEMCMP_FLAG -pthread -DOS_LINUX"
9295
PLATFORM_LDFLAGS="-pthread"
9396
PORT_FILE=port/port_posix.cc
97+
PORT_SSE_FILE=port/port_posix_sse.cc
9498
;;
9599
SunOS)
96100
PLATFORM=OS_SOLARIS
97101
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_SOLARIS"
98102
PLATFORM_LIBS="-lpthread -lrt"
99103
PORT_FILE=port/port_posix.cc
104+
PORT_SSE_FILE=port/port_posix_sse.cc
100105
;;
101106
FreeBSD)
102107
PLATFORM=OS_FREEBSD
103108
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_FREEBSD"
104109
PLATFORM_LIBS="-lpthread"
105110
PORT_FILE=port/port_posix.cc
111+
PORT_SSE_FILE=port/port_posix_sse.cc
106112
;;
107113
NetBSD)
108114
PLATFORM=OS_NETBSD
109115
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD"
110116
PLATFORM_LIBS="-lpthread -lgcc_s"
111117
PORT_FILE=port/port_posix.cc
118+
PORT_SSE_FILE=port/port_posix_sse.cc
112119
;;
113120
OpenBSD)
114121
PLATFORM=OS_OPENBSD
115122
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_OPENBSD"
116123
PLATFORM_LDFLAGS="-pthread"
117124
PORT_FILE=port/port_posix.cc
125+
PORT_SSE_FILE=port/port_posix_sse.cc
118126
;;
119127
DragonFly)
120128
PLATFORM=OS_DRAGONFLYBSD
121129
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_DRAGONFLYBSD"
122130
PLATFORM_LIBS="-lpthread"
123131
PORT_FILE=port/port_posix.cc
132+
PORT_SSE_FILE=port/port_posix_sse.cc
124133
;;
125134
OS_ANDROID_CROSSCOMPILE)
126135
PLATFORM=OS_ANDROID
127136
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
128137
PLATFORM_LDFLAGS="" # All pthread features are in the Android C library
129138
PORT_FILE=port/port_posix.cc
139+
PORT_SSE_FILE=port/port_posix_sse.cc
130140
CROSS_COMPILE=true
131141
;;
132142
HP-UX)
133143
PLATFORM=OS_HPUX
134144
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_HPUX"
135145
PLATFORM_LDFLAGS="-pthread"
136146
PORT_FILE=port/port_posix.cc
147+
PORT_SSE_FILE=port/port_posix_sse.cc
137148
# man ld: +h internal_name
138149
PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl,"
139150
;;
@@ -142,6 +153,7 @@ case "$TARGET_OS" in
142153
COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
143154
[ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
144155
PORT_FILE=port/port_posix.cc
156+
PORT_SSE_FILE=port/port_posix_sse.cc
145157
PLATFORM_SHARED_EXT=
146158
PLATFORM_SHARED_LDFLAGS=
147159
PLATFORM_SHARED_CFLAGS=
@@ -168,7 +180,7 @@ set +f # re-enable globbing
168180

169181
# The sources consist of the portable files, plus the platform-specific port
170182
# file.
171-
echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT
183+
echo "SOURCES=$PORTABLE_FILES $PORT_FILE $PORT_SSE_FILE" >> $OUTPUT
172184
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
173185

174186
if [ "$CROSS_COMPILE" = "true" ]; then
@@ -210,6 +222,21 @@ EOF
210222
fi
211223

212224
rm -f $CXXOUTPUT 2>/dev/null
225+
226+
# Test if gcc SSE 4.2 is supported
227+
$CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -msse4.2 2>/dev/null <<EOF
228+
int main() {}
229+
EOF
230+
if [ "$?" = 0 ]; then
231+
PLATFORM_SSEFLAGS="-msse4.2"
232+
fi
233+
234+
rm -f $CXXOUTPUT 2>/dev/null
235+
fi
236+
237+
# Use the SSE 4.2 CRC32C intrinsics iff the compile test above shows that the compiler accepts -msse4.2.
238+
if [ -n "$PLATFORM_SSEFLAGS" ]; then
239+
PLATFORM_SSEFLAGS="$PLATFORM_SSEFLAGS -DLEVELDB_PLATFORM_POSIX_SSE"
213240
fi
214241

215242
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
@@ -222,6 +249,7 @@ echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
222249
echo "PLATFORM_LIBS=$PLATFORM_LIBS" >> $OUTPUT
223250
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
224251
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
252+
echo "PLATFORM_SSEFLAGS=$PLATFORM_SSEFLAGS" >> $OUTPUT
225253
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
226254
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
227255
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT

port/port_example.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,12 @@ extern bool Snappy_Uncompress(const char* input_data, size_t input_length,
129129
// The concatenation of all "data[0,n-1]" fragments is the heap profile.
130130
extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg);
131131

132+
// Extend the CRC to include the first n bytes of buf.
133+
//
134+
// Returns zero if the CRC cannot be extended using acceleration, else returns
135+
// the newly extended CRC value (which may also be zero).
136+
uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size);
137+
132138
} // namespace port
133139
} // namespace leveldb
134140

port/port_posix.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,8 @@ inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
148148
return false;
149149
}
150150

151+
uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size);
152+
151153
} // namespace port
152154
} // namespace leveldb
153155

port/port_posix_sse.cc

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
// Copyright 2016 The LevelDB Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style license that can be
3+
// found in the LICENSE file. See the AUTHORS file for names of contributors.
4+
//
5+
// An implementation of crc32c accelerated with the x86 SSE 4.2 CRC32
6+
// instructions, handling up to eight bytes at a time.
7+
//
8+
// In a separate source file to allow this accelerated CRC32C function to be
9+
// compiled with the appropriate compiler flags to enable x86 SSE 4.2
10+
// instructions.
11+
12+
#include <stdint.h>
13+
#include <string.h>
14+
#include "port/port.h"
15+
16+
#if defined(LEVELDB_PLATFORM_POSIX_SSE)
17+
18+
#if defined(_MSC_VER)
19+
#include <intrin.h>
20+
#elif defined(__GNUC__) && defined(__SSE4_2__)
21+
#include <nmmintrin.h>
22+
#include <cpuid.h>
23+
#endif
24+
25+
#endif // defined(LEVELDB_PLATFORM_POSIX_SSE)
26+
27+
namespace leveldb {
28+
namespace port {
29+
30+
#if defined(LEVELDB_PLATFORM_POSIX_SSE)
31+
32+
// Reads the four bytes starting at |p| and returns them as a 32-bit word.
// The bytes are copied with memcpy, so |p| itself does not have to be
// aligned. No byte swapping is performed: SSE is x86 only, so the host byte
// order is already little-endian.
static inline uint32_t LE_LOAD32(const uint8_t *p) {
  uint32_t value;
  memcpy(&value, p, sizeof value);
  return value;
}
39+
40+
// Reads the eight bytes starting at |p| and returns them as a 64-bit word in
// host byte order. The bytes are copied with memcpy, so |p| itself does not
// have to be aligned.
static inline uint64_t LE_LOAD64(const uint8_t *p) {
  uint64_t value;
  memcpy(&value, p, sizeof value);
  return value;
}
46+
47+
// Reports whether the CPU executing this code advertises SSE 4.2 support,
// by testing bit 20 of the ECX value returned for CPUID leaf 1.
//
// Returns false when no CPUID mechanism is available for the compiler in use
// or when the CPUID query itself fails.
static inline bool HaveSSE42() {
#if defined(_MSC_VER)
  int cpu_info[4];
  __cpuid(cpu_info, 1);
  return (cpu_info[2] & (1 << 20)) != 0;
#elif defined(__GNUC__) && defined(__SSE4_2__)
  // Guarded by the same condition under which <cpuid.h> is included, so
  // __get_cpuid is only referenced where it has been declared.
  unsigned int eax = 0;
  unsigned int ebx = 0;
  unsigned int ecx = 0;
  unsigned int edx = 0;
  // __get_cpuid returns 0 when the requested leaf is unsupported; in that case
  // the output registers carry no information and must not be inspected.
  if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) == 0) {
    return false;
  }
  return (ecx & (1 << 20)) != 0;
#else
  return false;
#endif
}
60+
61+
#endif // defined(LEVELDB_PLATFORM_POSIX_SSE)
62+
63+
// For further improvements see Intel publication at:
// http://download.intel.com/design/intarch/papers/323405.pdf
//
// Extends |crc| over the first |size| bytes of |buf| using the SSE 4.2 CRC32
// instructions.
//
// Returns 0 when this file is built without LEVELDB_PLATFORM_POSIX_SSE or
// when HaveSSE42() reports that the running CPU lacks SSE 4.2. Otherwise
// returns the extended CRC value (which may itself be 0).
uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size) {
#if !defined(LEVELDB_PLATFORM_POSIX_SSE)
  return 0;
#else
  // The CPUID probe runs on the first call; later calls reuse the answer.
  static bool have = HaveSSE42();
  if (!have) {
    return 0;
  }

  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
  const uint8_t *e = p + size;
  // The running value is kept bit-inverted while bytes are folded in and is
  // inverted back on return.
  uint32_t l = crc ^ 0xffffffffu;

#define STEP1 do {                              \
    l = _mm_crc32_u8(l, *p++);                  \
} while (0)
#define STEP4 do {                              \
    l = _mm_crc32_u32(l, LE_LOAD32(p));         \
    p += 4;                                     \
} while (0)
#define STEP8 do {                              \
    l = _mm_crc32_u64(l, LE_LOAD64(p));         \
    p += 8;                                     \
} while (0)

  if (size > 16) {
    // Consume single bytes until |p| sits on an 8-byte boundary. That takes
    // (8 - p % 8) % 8 bytes, i.e. at most 7, which the size > 16 guard
    // guarantees are available.
    const uintptr_t misalignment = reinterpret_cast<uintptr_t>(p) % 8;
    for (uintptr_t i = (8 - misalignment) % 8; i != 0; --i) {
      STEP1;
    }

    // _mm_crc32_u64 is only available on x64.
#if defined(_M_X64) || defined(__x86_64__)
    // Process 8 bytes at a time
    while ((e-p) >= 8) {
      STEP8;
    }
    // Process 4 bytes at a time
    if ((e-p) >= 4) {
      STEP4;
    }
#else  // !(defined(_M_X64) || defined(__x86_64__))
    // Process 4 bytes at a time
    while ((e-p) >= 4) {
      STEP4;
    }
#endif  // defined(_M_X64) || defined(__x86_64__)
  }
  // Process the last few bytes
  while (p != e) {
    STEP1;
  }
#undef STEP8
#undef STEP4
#undef STEP1
  return l ^ 0xffffffffu;
#endif  // defined(LEVELDB_PLATFORM_POSIX_SSE)
}
123+
124+
} // namespace port
125+
} // namespace leveldb

util/crc32c.cc

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
#include "util/crc32c.h"
99

1010
#include <stdint.h>
11+
12+
#include "port/port.h"
1113
#include "util/coding.h"
1214

1315
namespace leveldb {
@@ -283,7 +285,23 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) {
283285
return DecodeFixed32(reinterpret_cast<const char*>(p));
284286
}
285287

288+
// Determine if the CPU running this program can accelerate the CRC32C
289+
// calculation.
290+
static bool CanAccelerateCRC32C() {
291+
// port::AcceleretedCRC32C returns zero when unable to accelerate.
292+
static const char kTestCRCBuffer[] = "TestCRCBuffer";
293+
static const char kBufSize = sizeof(kTestCRCBuffer) - 1;
294+
static const uint32_t kTestCRCValue = 0xdcbc59fa;
295+
296+
return port::AcceleratedCRC32C(0, kTestCRCBuffer, kBufSize) == kTestCRCValue;
297+
}
298+
286299
uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
300+
static bool accelerate = CanAccelerateCRC32C();
301+
if (accelerate) {
302+
return port::AcceleratedCRC32C(crc, buf, size);
303+
}
304+
287305
const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
288306
const uint8_t *e = p + size;
289307
uint32_t l = crc ^ 0xffffffffu;

0 commit comments

Comments
 (0)