
Commit 3c047f0

Optimize performance by using inline function DotProduct

This improves performance for the "best" models because it avoids function calls.

Signed-off-by: Stefan Weil <[email protected]>

1 parent e161501 commit 3c047f0
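
The mechanics of the change, sketched minimally below (condensed from the diff that follows, not a verbatim copy): the helper drops its class-static declaration and becomes file-local. With internal linkage and the inline hint, the compiler can fold the body straight into the per-row loop of MatrixDotVectorInternal instead of emitting a call for every output row.

// Before: out-of-line class-static member; the hot loop pays a call per row.
//   double total = WeightMatrix::DotProduct(wi, u, extent);
// After: file-local helper that the compiler can inline at each call site.
static inline double DotProduct(const double* u, const double* v, int n) {
  if (SIMDDetect::IsAVXAvailable()) return DotProductAVX(u, v, n);
  if (SIMDDetect::IsSSEAvailable()) return DotProductSSE(u, v, n);
  double total = 0.0;  // serial fallback when no SIMD unit is detected
  for (int k = 0; k < n; ++k) total += u[k] * v[k];
  return total;
}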

File tree

2 files changed: +24 -22

src/lstm/weightmatrix.cpp  +24 -20
@@ -38,6 +38,29 @@ const int kAdamCorrectionIterations = 200000;
 // Epsilon in Adam to prevent division by zero.
 const double kAdamEpsilon = 1e-8;
 
+// Computes and returns the dot product of the two n-vectors u and v.
+static inline double DotProduct(const double* u, const double* v, int n) {
+  // Note: because the order of addition is different among the 3 DotProduct
+  // functions, the results can (and do) vary slightly (although they agree
+  // to within about 4e-15). This produces different results when running
+  // training, despite all random inputs being precisely equal.
+  // To get consistent results, use just one of these DotProduct functions.
+  // On a test multi-layer network, serial is 57% slower than sse, and avx
+  // is about 8% faster than sse. This suggests that the time is memory
+  // bandwidth constrained and could benefit from holding the reused vector
+  // in AVX registers.
+
+  if (SIMDDetect::IsAVXAvailable())
+    return DotProductAVX(u, v, n);
+
+  if (SIMDDetect::IsSSEAvailable())
+    return DotProductSSE(u, v, n);
+
+  double total = 0.0;
+  for (int k = 0; k < n; ++k) total += u[k] * v[k];
+  return total;
+}
+
 // Computes matrix.vector v = Wu.
 // u is of size W.dim2() - add_bias_fwd and the output v is of size
 // W.dim1() - skip_bias_back.
@@ -54,7 +77,7 @@ static inline void MatrixDotVectorInternal(const GENERIC_2D_ARRAY<double>& w,
   int extent = w.dim2() - add_bias_fwd;
   for (int i = 0; i < num_results; ++i) {
     const double* wi = w[i];
-    double total = WeightMatrix::DotProduct(wi, u, extent);
+    double total = DotProduct(wi, u, extent);
     if (add_bias_fwd) total += wi[extent];  // The bias value.
     v[i] = total;
   }
@@ -389,25 +412,6 @@ void WeightMatrix::Debug2D(const char* msg) {
   histogram.print();
 }
 
-// Computes and returns the dot product of the two n-vectors u and v.
-/* static */
-double WeightMatrix::DotProduct(const double* u, const double* v, int n) {
-  // Note: because the order of addition is different among the 3 DotProduct
-  // functions, the results can (and do) vary slightly (although they agree
-  // to within about 4e-15). This produces different results when running
-  // training, despite all random inputs being precisely equal.
-  // To get consistent results, use just one of these DotProduct functions.
-  // On a test multi-layer network, serial is 57% slower than sse, and avx
-  // is about 8% faster than sse. This suggests that the time is memory
-  // bandwidth constrained and could benefit from holding the reused vector
-  // in AVX registers.
-  if (SIMDDetect::IsAVXAvailable()) return DotProductAVX(u, v, n);
-  if (SIMDDetect::IsSSEAvailable()) return DotProductSSE(u, v, n);
-  double total = 0.0;
-  for (int k = 0; k < n; ++k) total += u[k] * v[k];
-  return total;
-}
-
 // Utility function converts an array of float to the corresponding array
 // of double.
 /* static */
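
The ~4e-15 disagreement the comment mentions comes from floating-point addition being non-associative: the SIMD variants keep several partial sums per register lane and combine them at the end, a different addition order than the serial loop. A self-contained illustration (not from the commit; the 4-lane accumulation below is an assumed stand-in for the AVX ordering):

#include <cstdio>

int main() {
  const int n = 1003;  // deliberately not a multiple of 4
  double u[n], v[n];
  for (int k = 0; k < n; ++k) {
    u[k] = 1.0 / (k + 1);  // mixed magnitudes make rounding order visible
    v[k] = 1.0 + k % 7;
  }
  // Serial order: one running sum, left to right.
  double serial = 0.0;
  for (int k = 0; k < n; ++k) serial += u[k] * v[k];
  // AVX-like order: four partial sums (one per lane), combined at the end.
  double lane[4] = {0.0, 0.0, 0.0, 0.0};
  for (int k = 0; k < n; ++k) lane[k % 4] += u[k] * v[k];
  double avx_like = (lane[0] + lane[2]) + (lane[1] + lane[3]);
  std::printf("serial   %.17g\navx-like %.17g\ndelta    %g\n",
              serial, avx_like, serial - avx_like);
  return 0;
}

Each implementation is self-consistent; training runs diverge only when implementations are mixed, which is why the comment recommends pinning to a single DotProduct.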

src/lstm/weightmatrix.h  +0 -2
@@ -152,8 +152,6 @@ class WeightMatrix {
 
   void Debug2D(const char* msg);
 
-  // Computes and returns the dot product of the two n-vectors u and v.
-  static double DotProduct(const double* u, const double* v, int n);
   // Utility function converts an array of float to the corresponding array
   // of double.
   static void FloatToDouble(const GENERIC_2D_ARRAY<float>& wf,
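
With the declaration gone from the header, DotProduct is no longer part of WeightMatrix's public interface, so any external caller would now fail to compile. As for the comment's memory-bandwidth hypothesis, one rough way to probe it is a sketch like the one below (assumed methodology, not from this repository): time the serial loop at sizes inside and outside the cache and watch effective throughput flatten once the vectors spill to DRAM.

#include <chrono>
#include <cstdio>
#include <vector>

// Same serial fallback as in the patch, kept standalone here.
static inline double DotProductSerial(const double* u, const double* v, int n) {
  double total = 0.0;
  for (int k = 0; k < n; ++k) total += u[k] * v[k];
  return total;
}

int main() {
  for (int n : {1 << 10, 1 << 16, 1 << 22}) {  // 8 KB .. 32 MB per vector
    std::vector<double> u(n, 1.0), v(n, 2.0);
    const int reps = (1 << 26) / n;  // keep total work roughly constant
    double sink = 0.0;
    auto t0 = std::chrono::steady_clock::now();
    for (int r = 0; r < reps; ++r) sink += DotProductSerial(u.data(), v.data(), n);
    auto t1 = std::chrono::steady_clock::now();
    double secs = std::chrono::duration<double>(t1 - t0).count();
    std::printf("n=%8d  %6.2f GB/s  (checksum %g)\n", n,
                reps * 2.0 * n * sizeof(double) / secs / 1e9, sink);
  }
  return 0;
}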
