@@ -18,7 +18,65 @@ extern size_t Stringrchr(char *str,char ch, size_t stride,size_t len);
18
18
extern size_t Stringrchr2 (unsigned short * str , unsigned short ch , size_t stride ,size_t len );
19
19
extern size_t Stringrchr4 (unsigned int * str , unsigned int ch , size_t stride ,size_t len );
20
20
21
- #if defined(__SSE2__ ) || EMU_AVX
21
+ #if C_AVX2 || EMU_AVX2
22
+
23
#if defined(_MSC_VER)
#include <intrin.h>   /* _BitScanReverse */
#endif

/* Return the index (0 = least significant) of the highest set bit of `mask`.
 * `mask` must be nonzero; callers below only pass nonzero values. */
static unsigned srchr_top_bit(unsigned mask)
{
#if defined(_MSC_VER) && !defined(__clang__)
    unsigned long pos;
    _BitScanReverse(&pos, mask);
    return (unsigned)pos;
#elif defined(__clang__) || (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))
    /* __builtin_clz counts leading zeros of a 32-bit value */
    return 31u - (unsigned)__builtin_clz(mask);
#else
    /* portable fallback: shift until the value is exhausted */
    unsigned pos = 0;
    while (mask >>= 1) {
        ++pos;
    }
    return pos;
#endif
}

/* Scan `str[0..len)` backwards and skip every trailing byte equal to `ch`.
 *
 * Returns the 1-based position of the last byte that differs from `ch`
 * (equivalently, the length that remains once the trailing run of `ch` is
 * dropped), or 0 when `len` is 0 or every byte equals `ch`.
 *
 * Only bytes inside [str, str + len) are read: the vector loads are issued
 * on aligned chunks that lie entirely within the buffer.
 */
static size_t srchr(char *str, char ch, size_t len)
{
    size_t i = len;

    /* Scalar tail: step back until str + i sits on a 32-byte boundary so
     * that the aligned loads below are legal. */
    while (i > 0 && (((uintptr_t)str + i) & 31) != 0) {
        if (ch != str[i - 1]) {
            return i;
        }
        --i;
    }
    if (i == 0) {
        return 0;
    }

    /* 32 bytes at a time; str + i stays 32-byte aligned throughout. */
    const __m256i needle32 = _mm256_set1_epi8(ch);
    while (i >= 32) {
        const __m256i chunk = _mm256_load_si256((__m256i *)(str + i - 32));
        /* bit k of `diff` is set when str[i - 32 + k] != ch */
        const unsigned diff =
            ~(unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, needle32));
        if (diff != 0) {
            return i - 31 + srchr_top_bit(diff);
        }
        i -= 32;
    }

    /* Fewer than 32 bytes remain, so at most one aligned 16-byte chunk fits. */
    if (i >= 16) {
        const __m128i needle16 = _mm_set1_epi8(ch);
        const __m128i chunk = _mm_load_si128((__m128i *)(str + i - 16));
        /* only the low 16 bits of the movemask result are significant */
        const unsigned diff =
            ~(unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(chunk, needle16)) & 0xffffu;
        if (diff != 0) {
            return i - 15 + srchr_top_bit(diff);
        }
        i -= 16;
    }

    /* Scalar head: whatever is left in front of the first aligned chunk. */
    while (i > 0) {
        if (ch != str[i - 1]) {
            return i;
        }
        --i;
    }
    return 0;
}
79
+ #elif defined(__SSE2__ ) || EMU_AVX
22
80
23
81
static size_t srchr (char * str , char ch , size_t len ){
24
82
size_t i = len ;
0 commit comments