Commit 502bb62
More optimisations for IntSimdMatrix
* Move IntDotProductSSE. That allows inlining of the code.
* Improve IntDotProductSSE by moving some instructions.
* Remove unused num_input_groups_ from IntSimdMatrix.
* Re-order elements in IntSimdMatrix to avoid padding.

Signed-off-by: Stefan Weil <[email protected]>
1 parent 9560639 commit 502bb62

6 files changed: +63 -72 lines changed

src/arch/dotproductsse.cpp

-40

@@ -2,7 +2,6 @@
 // File: dotproductsse.cpp
 // Description: Architecture-specific dot-product function.
 // Author: Ray Smith
-// Created: Wed Jul 22 10:57:45 PDT 2015
 //
 // (C) Copyright 2015, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -79,43 +78,4 @@ double DotProductSSE(const double* u, const double* v, int n) {
   return result;
 }
 
-// Computes and returns the dot product of the n-vectors u and v.
-// Uses Intel SSE intrinsics to access the SIMD instruction set.
-int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) {
-  int max_offset = n - 8;
-  int offset = 0;
-  // Accumulate a set of 4 32-bit sums in sum, by loading 8 pairs of 8-bit
-  // values, extending to 16 bit, multiplying to make 32 bit results.
-  __m128i sum = _mm_setzero_si128();
-  if (offset <= max_offset) {
-    offset = 8;
-    __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u));
-    __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v));
-    sum = _mm_cvtepi8_epi16(packed1);
-    packed2 = _mm_cvtepi8_epi16(packed2);
-    // The magic _mm_madd_epi16 is perfect here. It multiplies 8 pairs of 16 bit
-    // ints to make 32 bit results, which are then horizontally added in pairs
-    // to make 4 32 bit results that still fit in a 128 bit register.
-    sum = _mm_madd_epi16(sum, packed2);
-    while (offset <= max_offset) {
-      packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u + offset));
-      packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v + offset));
-      offset += 8;
-      packed1 = _mm_cvtepi8_epi16(packed1);
-      packed2 = _mm_cvtepi8_epi16(packed2);
-      packed1 = _mm_madd_epi16(packed1, packed2);
-      sum = _mm_add_epi32(sum, packed1);
-    }
-  }
-  // Sum the 4 packed 32 bit sums and extract the low result.
-  sum = _mm_hadd_epi32(sum, sum);
-  sum = _mm_hadd_epi32(sum, sum);
-  int32_t result = _mm_cvtsi128_si32(sum);
-  while (offset < n) {
-    result += u[offset] * v[offset];
-    ++offset;
-  }
-  return result;
-}
-
 }  // namespace tesseract.
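IntDotProductSSE is deleted here and re-created below as a static function in src/arch/intsimdmatrixsse.cpp, its only caller. A minimal sketch of why that enables inlining (hypothetical names DotScalar and Caller, not from the commit): a function with internal linkage defined in the same translation unit as its caller can be inlined directly, whereas a definition in another .cpp file is normally opaque to the compiler without link-time optimisation.

  #include <cstdint>

  // Sketch: internal linkage lets the compiler inline the call below.
  static int32_t DotScalar(const int8_t* u, const int8_t* v, int n) {
    int32_t sum = 0;
    for (int i = 0; i < n; ++i) sum += u[i] * v[i];
    return sum;
  }

  int32_t Caller(const int8_t* u, const int8_t* v, int n) {
    return DotScalar(u, v, n);  // same translation unit: inlinable
  }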

src/arch/dotproductsse.h

-6

@@ -2,7 +2,6 @@
 // File: dotproductsse.h
 // Description: Architecture-specific dot-product function.
 // Author: Ray Smith
-// Created: Wed Jul 22 10:57:05 PDT 2015
 //
 // (C) Copyright 2015, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,16 +18,11 @@
 #ifndef TESSERACT_ARCH_DOTPRODUCTSSE_H_
 #define TESSERACT_ARCH_DOTPRODUCTSSE_H_
 
-#include <cstdint>  // for int32_t
-
 namespace tesseract {
 
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
 double DotProductSSE(const double* u, const double* v, int n);
-// Computes and returns the dot product of the n-vectors u and v.
-// Uses Intel SSE intrinsics to access the SIMD instruction set.
-int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n);
 
 }  // namespace tesseract.

src/arch/intsimdmatrix.h

+11 -11

@@ -85,17 +85,6 @@ struct IntSimdMatrix {
     return (input + factor - 1) / factor * factor;
   }
 
-  // Number of 32 bit outputs held in each register.
-  int num_outputs_per_register_;
-  // Maximum number of registers that we will use to hold outputs.
-  int max_output_registers_;
-  // Number of 8 bit inputs in the inputs register.
-  int num_inputs_per_register_;
-  // Number of inputs in each weight group.
-  int num_inputs_per_group_;
-  // Number of groups of inputs to be broadcast.
-  int num_input_groups_;
-
   // Computes matrix.vector v = Wu.
   // u is of size W.dim2() - 1 and the output v is of size W.dim1().
   // u is imagined to have an extra element at the end with value 1, to
@@ -109,6 +98,17 @@
       const int8_t* wi, const double* scales, const int8_t* u, double* v);
   MatrixDotVectorFunction matrixDotVectorFunction;
 
+  // Number of 32 bit outputs held in each register.
+  int num_outputs_per_register_;
+  // Maximum number of registers that we will use to hold outputs.
+  int max_output_registers_;
+  // Number of 8 bit inputs in the inputs register.
+  int num_inputs_per_register_;
+  // Number of inputs in each weight group.
+  int num_inputs_per_group_;
+  // Number of groups of inputs to be broadcast.
+  // num_input_groups_ = num_inputs_per_register_ / num_inputs_per_group_
+
   static const IntSimdMatrix* intSimdMatrix;
   static const IntSimdMatrix intSimdMatrixAVX2;
   static const IntSimdMatrix intSimdMatrixSSE;
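Why moving matrixDotVectorFunction ahead of the int members avoids padding, as a rough sketch (hypothetical struct names, typical 64-bit ABI assumed, not part of the commit): the old layout put five 4-byte ints in front of the 8-byte function pointer, forcing 4 bytes of padding before the pointer; with the pointer first and the unused num_input_groups_ removed, the four remaining ints pack behind it with no padding.

  // Sketch of the layout effect on a typical LP64 platform.
  struct OldLayout {
    int a, b, c, d, e;  // 5 x 4 bytes = 20
    void (*fn)();       // needs 8-byte alignment: 4 padding bytes first
  };                    // sizeof(OldLayout) == 32
  struct NewLayout {
    void (*fn)();       // 8 bytes
    int a, b, c, d;     // 4 x 4 bytes = 16
  };                    // sizeof(NewLayout) == 24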

src/arch/intsimdmatrixavx2.cpp

+3 -5

@@ -329,18 +329,16 @@ static void matrixDotVector(int dim1, int dim2, const int8_t* wi,
 }
 
 const IntSimdMatrix IntSimdMatrix::intSimdMatrixAVX2 = {
+  // Function.
+  matrixDotVector,
   // Number of 32 bit outputs held in each register.
   kNumOutputsPerRegister,
   // Maximum number of registers that we will use to hold outputs.
   kMaxOutputRegisters,
   // Number of 8 bit inputs in the inputs register.
   kNumInputsPerRegister,
   // Number of inputs in each weight group.
-  kNumInputsPerGroup,
-  // Number of groups of inputs to be broadcast.
-  kNumInputGroups,
-  // Function.
-  matrixDotVector
+  kNumInputsPerGroup
 };
 
 }  // namespace tesseract.

src/arch/intsimdmatrixsse.cpp

+48 -9

@@ -22,13 +22,54 @@
 #include "intsimdmatrix.h"
 
 #include <cstdint>
+#include <emmintrin.h>
+#include <smmintrin.h>
 #include "dotproductsse.h"
 
 namespace tesseract {
 
+// Computes and returns the dot product of the n-vectors u and v.
+// Uses Intel SSE intrinsics to access the SIMD instruction set.
+static int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) {
+  int max_offset = n - 8;
+  int offset = 0;
+  // Accumulate a set of 4 32-bit sums in sum, by loading 8 pairs of 8-bit
+  // values, extending to 16 bit, multiplying to make 32 bit results.
+  int32_t result = 0;
+  if (offset <= max_offset) {
+    offset = 8;
+    __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u));
+    __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v));
+    __m128i sum = _mm_cvtepi8_epi16(packed1);
+    packed2 = _mm_cvtepi8_epi16(packed2);
+    // The magic _mm_madd_epi16 is perfect here. It multiplies 8 pairs of 16 bit
+    // ints to make 32 bit results, which are then horizontally added in pairs
+    // to make 4 32 bit results that still fit in a 128 bit register.
+    sum = _mm_madd_epi16(sum, packed2);
+    while (offset <= max_offset) {
+      packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u + offset));
+      packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v + offset));
+      offset += 8;
+      packed1 = _mm_cvtepi8_epi16(packed1);
+      packed2 = _mm_cvtepi8_epi16(packed2);
+      packed1 = _mm_madd_epi16(packed1, packed2);
+      sum = _mm_add_epi32(sum, packed1);
+    }
+    // Sum the 4 packed 32 bit sums and extract the low result.
+    sum = _mm_hadd_epi32(sum, sum);
+    sum = _mm_hadd_epi32(sum, sum);
+    result = _mm_cvtsi128_si32(sum);
+  }
+  while (offset < n) {
+    result += u[offset] * v[offset];
+    ++offset;
+  }
+  return result;
+}
+
 // Computes part of matrix.vector v = Wu. Computes 1 result.
 static void PartialMatrixDotVector1(const int8_t* wi, const double* scales,
-                                    const int8_t* u, int num_in, int num_out,
+                                    const int8_t* u, int num_in,
                                     double* v) {
   double total = IntDotProductSSE(u, wi, num_in);
   // Add in the bias and correct for integer values.
@@ -41,26 +82,24 @@ static void matrixDotVector(int dim1, int dim2, const int8_t* wi,
   const int num_in = dim2 - 1;
   int output = 0;
 
-  for (; output + 1 <= num_out; output += 1) {
-    PartialMatrixDotVector1(wi, scales, u, num_in, num_out - output, v);
+  for (; output < num_out; output++) {
+    PartialMatrixDotVector1(wi, scales, u, num_in, v);
     wi += dim2;
-    scales += 1;
-    v += 1;
+    scales++;
+    v++;
   }
 }
 
 const IntSimdMatrix IntSimdMatrix::intSimdMatrixSSE = {
+  matrixDotVector,
   // Number of 32 bit outputs held in each register.
   1,
   // Maximum number of registers that we will use to hold outputs.
   1,
   // Number of 8 bit inputs in the inputs register.
   1,
   // Number of inputs in each weight group.
-  1,
-  // Number of groups of inputs to be broadcast.
-  1,
-  matrixDotVector
+  1
 };
 
 }  // namespace tesseract.
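For reference, a scalar model of the _mm_madd_epi16 step described in the comment above (an illustrative sketch, not part of the commit): each 32-bit output lane holds the sum of two adjacent 16-bit products, so a single instruction both multiplies and half-reduces eight pairs into four lanes.

  #include <cstdint>

  // Scalar model of _mm_madd_epi16: multiply 8 pairs of 16-bit ints and
  // add adjacent 32-bit products pairwise into 4 32-bit results.
  static void MaddEpi16Model(const int16_t a[8], const int16_t b[8],
                             int32_t out[4]) {
    for (int i = 0; i < 4; ++i) {
      out[i] = int32_t(a[2 * i]) * b[2 * i] +
               int32_t(a[2 * i + 1]) * b[2 * i + 1];
    }
  }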

unittest/intsimdmatrix_test.cc

+1 -1

@@ -88,7 +88,7 @@ class IntSimdMatrixTest : public ::testing::Test {
 
 // Test the C++ implementation without SIMD.
 TEST_F(IntSimdMatrixTest, C) {
-  static const IntSimdMatrix matrix = {1, 1, 1, 1, 1, nullptr};
+  static const IntSimdMatrix matrix = {nullptr, 1, 1, 1, 1};
   ExpectEqualResults(matrix);
 }
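The initializer re-order here follows from the struct change in src/arch/intsimdmatrix.h above: aggregate initialization assigns values in declaration order, so once matrixDotVectorFunction is the first member, the function pointer must lead every brace initializer. A minimal sketch of the rule (hypothetical struct, not from the commit):

  struct S {
    void (*fn)();
    int a, b;
  };
  // Members are initialized in declaration order: fn, then a, then b.
  static const S s = {nullptr, 1, 2};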
