#include "intsimdmatrix.h"

#include <cstdint>
+ #include <emmintrin.h>
+ #include <smmintrin.h>
#include "dotproductsse.h"

namespace tesseract {

+ // Computes and returns the dot product of the n-vectors u and v.
+ // Uses Intel SSE intrinsics to access the SIMD instruction set.
+ static int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) {
+   int max_offset = n - 8;
+   int offset = 0;
+   // Accumulate a set of 4 32-bit sums in sum, by loading 8 pairs of 8-bit
+   // values, extending to 16 bit, multiplying to make 32 bit results.
+   int32_t result = 0;
+   if (offset <= max_offset) {
+     offset = 8;
+     __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u));
+     __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v));
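+     // _mm_cvtepi8_epi16 sign-extends the low 8 int8 lanes to int16 (SSE4.1).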
+     __m128i sum = _mm_cvtepi8_epi16(packed1);
+     packed2 = _mm_cvtepi8_epi16(packed2);
+     // The magic _mm_madd_epi16 is perfect here. It multiplies 8 pairs of 16 bit
+     // ints to make 32 bit results, which are then horizontally added in pairs
+     // to make 4 32 bit results that still fit in a 128 bit register.
+     sum = _mm_madd_epi16(sum, packed2);
+     while (offset <= max_offset) {
+       packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u + offset));
+       packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v + offset));
+       offset += 8;
+       packed1 = _mm_cvtepi8_epi16(packed1);
+       packed2 = _mm_cvtepi8_epi16(packed2);
+       packed1 = _mm_madd_epi16(packed1, packed2);
+       sum = _mm_add_epi32(sum, packed1);
+     }
+     // Sum the 4 packed 32 bit sums and extract the low result.
+     sum = _mm_hadd_epi32(sum, sum);
+     sum = _mm_hadd_epi32(sum, sum);
+     result = _mm_cvtsi128_si32(sum);
+   }
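+   // Process any remaining elements (n not a multiple of 8) with scalar code.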
+   while (offset < n) {
+     result += u[offset] * v[offset];
+     ++offset;
+   }
+   return result;
+ }
+
// Computes part of matrix.vector v = Wu. Computes 1 result.
static void PartialMatrixDotVector1(const int8_t* wi, const double* scales,
-                                     const int8_t* u, int num_in, int num_out,
+                                     const int8_t* u, int num_in,
                                    double* v) {
  double total = IntDotProductSSE(u, wi, num_in);
  // Add in the bias and correct for integer values.
@@ -41,26 +82,24 @@ static void matrixDotVector(int dim1, int dim2, const int8_t* wi,
  const int num_in = dim2 - 1;
  int output = 0;

-   for (; output + 1 <= num_out; output += 1) {
-     PartialMatrixDotVector1(wi, scales, u, num_in, num_out - output, v);
+   for (; output < num_out; output++) {
+     PartialMatrixDotVector1(wi, scales, u, num_in, v);
    wi += dim2;
-     scales += 1;
-     v += 1;
+     scales++;
+     v++;
  }
}

const IntSimdMatrix IntSimdMatrix::intSimdMatrixSSE = {
+     matrixDotVector,
    // Number of 32 bit outputs held in each register.
    1,
    // Maximum number of registers that we will use to hold outputs.
    1,
    // Number of 8 bit inputs in the inputs register.
    1,
    // Number of inputs in each weight group.
-     1,
-     // Number of groups of inputs to be broadcast.
-     1,
-     matrixDotVector
+     1
};

} // namespace tesseract.
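
A standalone sketch (not part of this commit) that checks the widen-multiply-accumulate sequence used in IntDotProductSSE against a plain scalar dot product for one 8-element block. The test harness and its main() are hypothetical; it only assumes an SSE4.1-capable build (e.g. -msse4.1 with gcc/clang).

#include <smmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  const int8_t a[8] = {1, -2, 3, -4, 5, -6, 7, -8};
  const int8_t b[8] = {8, 7, -6, 5, -4, 3, 2, -1};
  // Load 8 bytes from each array and sign-extend them to 16-bit lanes.
  __m128i pa = _mm_cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(a)));
  __m128i pb = _mm_cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(b)));
  // Multiply 8 pairs of 16-bit ints and add adjacent products: 4 32-bit sums.
  __m128i sum = _mm_madd_epi16(pa, pb);
  // Two horizontal adds collapse the 4 partial sums into lane 0.
  sum = _mm_hadd_epi32(sum, sum);
  sum = _mm_hadd_epi32(sum, sum);
  int32_t simd = _mm_cvtsi128_si32(sum);
  int32_t ref = 0;
  for (int i = 0; i < 8; ++i) ref += a[i] * b[i];
  printf("simd=%d ref=%d\n", simd, ref);  // The two values should match.
  return simd == ref ? 0 : 1;
}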