Commit 502bb62
More optimisations for IntSimdMatrix
* Move IntDotProductSSE. That allows inlining of the code.
* Improve IntDotProductSSE by moving some instructions.
* Remove unused num_input_groups_ from IntSimdMatrix.
* Re-order elements in IntSimdMatrix to avoid padding.

Signed-off-by: Stefan Weil <[email protected]>
1 parent 9560639 commit 502bb62

6 files changed: +63 -72 lines changed

src/arch/dotproductsse.cpp

-40

@@ -2,7 +2,6 @@
 // File: dotproductsse.cpp
 // Description: Architecture-specific dot-product function.
 // Author: Ray Smith
-// Created: Wed Jul 22 10:57:45 PDT 2015
 //
 // (C) Copyright 2015, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -79,43 +78,4 @@ double DotProductSSE(const double* u, const double* v, int n) {
   return result;
 }
 
-// Computes and returns the dot product of the n-vectors u and v.
-// Uses Intel SSE intrinsics to access the SIMD instruction set.
-int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) {
-  int max_offset = n - 8;
-  int offset = 0;
-  // Accumulate a set of 4 32-bit sums in sum, by loading 8 pairs of 8-bit
-  // values, extending to 16 bit, multiplying to make 32 bit results.
-  __m128i sum = _mm_setzero_si128();
-  if (offset <= max_offset) {
-    offset = 8;
-    __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u));
-    __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v));
-    sum = _mm_cvtepi8_epi16(packed1);
-    packed2 = _mm_cvtepi8_epi16(packed2);
-    // The magic _mm_madd_epi16 is perfect here. It multiplies 8 pairs of 16 bit
-    // ints to make 32 bit results, which are then horizontally added in pairs
-    // to make 4 32 bit results that still fit in a 128 bit register.
-    sum = _mm_madd_epi16(sum, packed2);
-    while (offset <= max_offset) {
-      packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u + offset));
-      packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v + offset));
-      offset += 8;
-      packed1 = _mm_cvtepi8_epi16(packed1);
-      packed2 = _mm_cvtepi8_epi16(packed2);
-      packed1 = _mm_madd_epi16(packed1, packed2);
-      sum = _mm_add_epi32(sum, packed1);
-    }
-  }
-  // Sum the 4 packed 32 bit sums and extract the low result.
-  sum = _mm_hadd_epi32(sum, sum);
-  sum = _mm_hadd_epi32(sum, sum);
-  int32_t result = _mm_cvtsi128_si32(sum);
-  while (offset < n) {
-    result += u[offset] * v[offset];
-    ++offset;
-  }
-  return result;
-}
-
 }  // namespace tesseract.
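IntDotProductSSE is deleted here and re-created below as a static function in src/arch/intsimdmatrixsse.cpp, its only caller. A minimal sketch of why that enables inlining (hypothetical names DotScalar and Caller, not from the commit): a function with internal linkage defined in the same translation unit as its caller can be inlined directly, whereas a definition in another .cpp file is normally opaque to the compiler without link-time optimisation.

  #include <cstdint>

  // Sketch: internal linkage lets the compiler inline the call below.
  static int32_t DotScalar(const int8_t* u, const int8_t* v, int n) {
    int32_t sum = 0;
    for (int i = 0; i < n; ++i) sum += u[i] * v[i];
    return sum;
  }

  int32_t Caller(const int8_t* u, const int8_t* v, int n) {
    return DotScalar(u, v, n);  // same translation unit: inlinable
  }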

src/arch/dotproductsse.h

-6

@@ -2,7 +2,6 @@
 // File: dotproductsse.h
 // Description: Architecture-specific dot-product function.
 // Author: Ray Smith
-// Created: Wed Jul 22 10:57:05 PDT 2015
 //
 // (C) Copyright 2015, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,16 +18,11 @@
 #ifndef TESSERACT_ARCH_DOTPRODUCTSSE_H_
 #define TESSERACT_ARCH_DOTPRODUCTSSE_H_
 
-#include <cstdint>  // for int32_t
-
 namespace tesseract {
 
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
 double DotProductSSE(const double* u, const double* v, int n);
-// Computes and returns the dot product of the n-vectors u and v.
-// Uses Intel SSE intrinsics to access the SIMD instruction set.
-int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n);
 
 }  // namespace tesseract.

src/arch/intsimdmatrix.h

+11 -11

@@ -85,17 +85,6 @@ struct IntSimdMatrix {
     return (input + factor - 1) / factor * factor;
   }
 
-  // Number of 32 bit outputs held in each register.
-  int num_outputs_per_register_;
-  // Maximum number of registers that we will use to hold outputs.
-  int max_output_registers_;
-  // Number of 8 bit inputs in the inputs register.
-  int num_inputs_per_register_;
-  // Number of inputs in each weight group.
-  int num_inputs_per_group_;
-  // Number of groups of inputs to be broadcast.
-  int num_input_groups_;
-
   // Computes matrix.vector v = Wu.
   // u is of size W.dim2() - 1 and the output v is of size W.dim1().
   // u is imagined to have an extra element at the end with value 1, to
@@ -109,6 +98,17 @@
       const int8_t* wi, const double* scales, const int8_t* u, double* v);
   MatrixDotVectorFunction matrixDotVectorFunction;
 
+  // Number of 32 bit outputs held in each register.
+  int num_outputs_per_register_;
+  // Maximum number of registers that we will use to hold outputs.
+  int max_output_registers_;
+  // Number of 8 bit inputs in the inputs register.
+  int num_inputs_per_register_;
+  // Number of inputs in each weight group.
+  int num_inputs_per_group_;
+  // Number of groups of inputs to be broadcast.
+  // num_input_groups_ = num_inputs_per_register_ / num_inputs_per_group_
+
   static const IntSimdMatrix* intSimdMatrix;
   static const IntSimdMatrix intSimdMatrixAVX2;
   static const IntSimdMatrix intSimdMatrixSSE;
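Why moving matrixDotVectorFunction ahead of the int members avoids padding, as a rough sketch (hypothetical struct names, typical 64-bit ABI assumed, not part of the commit): the old layout put five 4-byte ints in front of the 8-byte function pointer, forcing 4 bytes of padding before the pointer; with the pointer first and the unused num_input_groups_ removed, the four remaining ints pack behind it with no padding.

  // Sketch of the layout effect on a typical LP64 platform.
  struct OldLayout {
    int a, b, c, d, e;  // 5 x 4 bytes = 20
    void (*fn)();       // needs 8-byte alignment: 4 padding bytes first
  };                    // sizeof(OldLayout) == 32
  struct NewLayout {
    void (*fn)();       // 8 bytes
    int a, b, c, d;     // 4 x 4 bytes = 16
  };                    // sizeof(NewLayout) == 24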

src/arch/intsimdmatrixavx2.cpp

+3 -5

@@ -329,18 +329,16 @@ static void matrixDotVector(int dim1, int dim2, const int8_t* wi,
 }
 
 const IntSimdMatrix IntSimdMatrix::intSimdMatrixAVX2 = {
+  // Function.
+  matrixDotVector,
   // Number of 32 bit outputs held in each register.
   kNumOutputsPerRegister,
   // Maximum number of registers that we will use to hold outputs.
   kMaxOutputRegisters,
   // Number of 8 bit inputs in the inputs register.
   kNumInputsPerRegister,
   // Number of inputs in each weight group.
-  kNumInputsPerGroup,
-  // Number of groups of inputs to be broadcast.
-  kNumInputGroups,
-  // Function.
-  matrixDotVector
+  kNumInputsPerGroup
 };
 
 }  // namespace tesseract.

src/arch/intsimdmatrixsse.cpp

+48 -9

@@ -22,13 +22,54 @@
 #include "intsimdmatrix.h"
 
 #include <cstdint>
+#include <emmintrin.h>
+#include <smmintrin.h>
 #include "dotproductsse.h"
 
 namespace tesseract {
 
+// Computes and returns the dot product of the n-vectors u and v.
+// Uses Intel SSE intrinsics to access the SIMD instruction set.
+static int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) {
+  int max_offset = n - 8;
+  int offset = 0;
+  // Accumulate a set of 4 32-bit sums in sum, by loading 8 pairs of 8-bit
+  // values, extending to 16 bit, multiplying to make 32 bit results.
+  int32_t result = 0;
+  if (offset <= max_offset) {
+    offset = 8;
+    __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u));
+    __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v));
+    __m128i sum = _mm_cvtepi8_epi16(packed1);
+    packed2 = _mm_cvtepi8_epi16(packed2);
+    // The magic _mm_madd_epi16 is perfect here. It multiplies 8 pairs of 16 bit
+    // ints to make 32 bit results, which are then horizontally added in pairs
+    // to make 4 32 bit results that still fit in a 128 bit register.
+    sum = _mm_madd_epi16(sum, packed2);
+    while (offset <= max_offset) {
+      packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u + offset));
+      packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v + offset));
+      offset += 8;
+      packed1 = _mm_cvtepi8_epi16(packed1);
+      packed2 = _mm_cvtepi8_epi16(packed2);
+      packed1 = _mm_madd_epi16(packed1, packed2);
+      sum = _mm_add_epi32(sum, packed1);
+    }
+    // Sum the 4 packed 32 bit sums and extract the low result.
+    sum = _mm_hadd_epi32(sum, sum);
+    sum = _mm_hadd_epi32(sum, sum);
+    result = _mm_cvtsi128_si32(sum);
+  }
+  while (offset < n) {
+    result += u[offset] * v[offset];
+    ++offset;
+  }
+  return result;
+}
+
 // Computes part of matrix.vector v = Wu. Computes 1 result.
 static void PartialMatrixDotVector1(const int8_t* wi, const double* scales,
-                                    const int8_t* u, int num_in, int num_out,
+                                    const int8_t* u, int num_in,
                                     double* v) {
   double total = IntDotProductSSE(u, wi, num_in);
   // Add in the bias and correct for integer values.
@@ -41,26 +82,24 @@ static void matrixDotVector(int dim1, int dim2, const int8_t* wi,
   const int num_in = dim2 - 1;
   int output = 0;
 
-  for (; output + 1 <= num_out; output += 1) {
-    PartialMatrixDotVector1(wi, scales, u, num_in, num_out - output, v);
+  for (; output < num_out; output++) {
+    PartialMatrixDotVector1(wi, scales, u, num_in, v);
     wi += dim2;
-    scales += 1;
-    v += 1;
+    scales++;
+    v++;
   }
 }
 
 const IntSimdMatrix IntSimdMatrix::intSimdMatrixSSE = {
+  matrixDotVector,
   // Number of 32 bit outputs held in each register.
   1,
   // Maximum number of registers that we will use to hold outputs.
   1,
   // Number of 8 bit inputs in the inputs register.
   1,
   // Number of inputs in each weight group.
-  1,
-  // Number of groups of inputs to be broadcast.
-  1,
-  matrixDotVector
+  1
 };
 
 }  // namespace tesseract.
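For reference, a scalar model of the _mm_madd_epi16 step described in the comment above (an illustrative sketch, not part of the commit): each 32-bit output lane holds the sum of two adjacent 16-bit products, so a single instruction both multiplies and half-reduces eight pairs into four lanes.

  #include <cstdint>

  // Scalar model of _mm_madd_epi16: multiply 8 pairs of 16-bit ints and
  // add adjacent 32-bit products pairwise into 4 32-bit results.
  static void MaddEpi16Model(const int16_t a[8], const int16_t b[8],
                             int32_t out[4]) {
    for (int i = 0; i < 4; ++i) {
      out[i] = int32_t(a[2 * i]) * b[2 * i] +
               int32_t(a[2 * i + 1]) * b[2 * i + 1];
    }
  }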

unittest/intsimdmatrix_test.cc

+1 -1

@@ -88,7 +88,7 @@ class IntSimdMatrixTest : public ::testing::Test {
 
 // Test the C++ implementation without SIMD.
 TEST_F(IntSimdMatrixTest, C) {
-  static const IntSimdMatrix matrix = {1, 1, 1, 1, 1, nullptr};
+  static const IntSimdMatrix matrix = {nullptr, 1, 1, 1, 1};
   ExpectEqualResults(matrix);
 }
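The initializer re-order here follows from the struct change in src/arch/intsimdmatrix.h above: aggregate initialization assigns values in declaration order, so once matrixDotVectorFunction is the first member, the function pointer must lead every brace initializer. A minimal sketch of the rule (hypothetical struct, not from the commit):

  struct S {
    void (*fn)();
    int a, b;
  };
  // Members are initialized in declaration order: fn, then a, then b.
  static const S s = {nullptr, 1, 2};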
