Skip to content

Commit d622ca0

Browse files
committed
Merge branch 'release-2.1.0-rc19'
2 parents 9243ed1 + 2cf7c7e commit d622ca0

13 files changed

+166
-42
lines changed

VERSION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MAJOR = 2
22
MINOR = 1
3-
PATCH = 0-rc18
3+
PATCH = 0-rc19
44
# A specific DATE (YYYY-MM-DD) fixes an official release, otherwise
55
# it is considered Development version.
66
DATE =

src/acc/acc.h

+6
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@
1111

1212
#include <stddef.h>
1313

14+
#define DBCSR_STRINGIFY_AUX(SYMBOL) #SYMBOL
15+
#define DBCSR_STRINGIFY(SYMBOL) DBCSR_STRINGIFY_AUX(SYMBOL)
16+
#define DBCSR_CONCATENATE2(A, B) A##B
17+
#define DBCSR_CONCATENATE(A, B) DBCSR_CONCATENATE2(A, B)
18+
19+
1420
#if defined(__cplusplus)
1521
extern "C" {
1622
#endif

src/acc/acc_bench_smm.c

+112-24
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
#if !defined(ELEM_TYPE)
2121
# define ELEM_TYPE double
2222
#endif
23+
#if !defined(EPSILON)
24+
# define EPSILON 1E-3
25+
#endif
2326
#if !defined(MAX_KERNEL_DIM)
2427
# define MAX_KERNEL_DIM 80
2528
#endif
@@ -67,44 +70,66 @@ int main(int argc, char* argv[])
6770
const int mn = m * n, mk = m * k, kn = k * n;
6871
#endif
6972
#if defined(WARMUP) && (0 < WARMUP) && !defined(_DEBUG)
70-
const int warmup = WARMUP;
73+
const int warmup = MAX(WARMUP, 2) / 2 * 2;
7174
#else
7275
const int warmup = 0;
7376
#endif
74-
int *stack_hst = NULL, *stack_dev = NULL;
77+
int *stack_hst = NULL, *stack_dev = NULL, *trans_hst = NULL, *trans_dev = NULL;
7578
ELEM_TYPE *amat_hst = NULL, *bmat_hst = NULL, *cmat_hst = NULL;
7679
ELEM_TYPE *amat_dev = NULL, *bmat_dev = NULL, *cmat_dev = NULL;
77-
int result = EXIT_SUCCESS, r, i;
80+
int result = EXIT_SUCCESS, ndevices = 0, r, i;
7881
void *stream = NULL;
7982
#if defined(USE_LIBXSMM)
8083
libxsmm_timer_tickint start;
81-
double duration;
84+
double duration, transpose;
8285
#endif
8386
assert(m <= (mn / n) && 0 == (mn % n) && k <= (mk / k) && 0 == (mk % k) && n <= (kn / n) && 0 == (kn % n));
84-
printf("%s%s%i %i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n, k);
87+
printf("%s%s%i %i %i %i %i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "",
88+
nrepeat, stack_size, m, n, k, nc, na, nb);
8589
CHECK(acc_init(), &result);
90+
CHECK(acc_get_ndevices(&ndevices), &result);
91+
if (0 < ndevices) {
92+
#if defined(_DEBUG)
93+
fprintf(stderr, "number of devices found: %i\n", ndevices);
94+
#endif
95+
}
96+
else {
97+
#if defined(_DEBUG)
98+
fprintf(stderr, "Error: no device found!\n");
99+
#endif
100+
CHECK(acc_finalize(), NULL);
101+
return result;
102+
}
103+
printf("element type: %s\n", DBCSR_STRINGIFY(ELEM_TYPE));
86104
CHECK(acc_stream_create(&stream, "stream", -1/*default priority*/), &result);
87-
CHECK(acc_host_mem_allocate((void**)&amat_hst, sizeof(ELEM_TYPE) * mk * stack_size, stream), &result);
88-
CHECK(acc_host_mem_allocate((void**)&bmat_hst, sizeof(ELEM_TYPE) * kn * stack_size, stream), &result);
89-
CHECK(acc_host_mem_allocate((void**)&cmat_hst, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
105+
CHECK(acc_host_mem_allocate((void**)&amat_hst, sizeof(ELEM_TYPE) * mk * na, stream), &result);
106+
CHECK(acc_host_mem_allocate((void**)&bmat_hst, sizeof(ELEM_TYPE) * kn * nb, stream), &result);
107+
CHECK(acc_host_mem_allocate((void**)&cmat_hst, sizeof(ELEM_TYPE) * mn * nc, stream), &result);
90108
CHECK(acc_host_mem_allocate((void**)&stack_hst, sizeof(int) * 3 * stack_size, stream), &result);
109+
CHECK(acc_host_mem_allocate((void**)&trans_hst, sizeof(int) * nb, stream), &result);
91110
CHECK(acc_stream_sync(stream), &result); /* ensure host-data is allocated */
92-
for (i = 0; i < stack_size; ++i) { /* initialize matrices */
111+
/* initialize matrices */
112+
for (i = 0; i < na; ++i) {
93113
init(i/*seed*/ + 42, &amat_hst[i*mk], m, k);
114+
}
115+
for (i = 0; i < nb; ++i) {
94116
init(i/*seed*/ + 24, &bmat_hst[i*kn], k, n);
117+
trans_hst[i] = i * kn;
95118
}
96119
init_stack(stack_hst, stack_size, mn, mk, kn, nc, na, nb);
97-
CHECK(acc_dev_mem_allocate((void**)&amat_dev, sizeof(ELEM_TYPE) * mk * stack_size), &result);
98-
CHECK(acc_dev_mem_allocate((void**)&bmat_dev, sizeof(ELEM_TYPE) * kn * stack_size), &result);
99-
CHECK(acc_dev_mem_allocate((void**)&cmat_dev, sizeof(ELEM_TYPE) * mn * stack_size), &result);
120+
CHECK(acc_dev_mem_allocate((void**)&amat_dev, sizeof(ELEM_TYPE) * mk * na), &result);
121+
CHECK(acc_dev_mem_allocate((void**)&bmat_dev, sizeof(ELEM_TYPE) * kn * nb), &result);
122+
CHECK(acc_dev_mem_allocate((void**)&cmat_dev, sizeof(ELEM_TYPE) * mn * nc), &result);
100123
CHECK(acc_dev_mem_allocate((void**)&stack_dev, sizeof(int) * 3 * stack_size), &result);
101-
CHECK(acc_memset_zero(cmat_dev, 0/*offset*/, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
124+
CHECK(acc_dev_mem_allocate((void**)&trans_dev, sizeof(int) * nb), &result);
125+
CHECK(acc_memset_zero(cmat_dev, 0/*offset*/, sizeof(ELEM_TYPE) * mn * nc, stream), &result);
126+
CHECK(acc_memcpy_h2d(trans_hst, trans_dev, sizeof(int) * nb, stream), &result);
102127
#if defined(USE_LIBXSMM)
103128
CHECK(acc_stream_sync(stream), &result);
104129
start = libxsmm_timer_tick();
105130
#endif
106-
CHECK(acc_memcpy_h2d(amat_hst, amat_dev, sizeof(ELEM_TYPE) * mk * stack_size, stream), &result);
107-
CHECK(acc_memcpy_h2d(bmat_hst, bmat_dev, sizeof(ELEM_TYPE) * kn * stack_size, stream), &result);
131+
CHECK(acc_memcpy_h2d(amat_hst, amat_dev, sizeof(ELEM_TYPE) * mk * na, stream), &result);
132+
CHECK(acc_memcpy_h2d(bmat_hst, bmat_dev, sizeof(ELEM_TYPE) * kn * nb, stream), &result);
108133
CHECK(acc_memcpy_h2d(stack_hst, stack_dev, sizeof(int) * 3 * stack_size, stream), &result);
109134
#if defined(USE_LIBXSMM)
110135
CHECK(acc_stream_sync(stream), &result);
@@ -113,55 +138,118 @@ int main(int argc, char* argv[])
113138
(sizeof(ELEM_TYPE) * (mk + kn) + sizeof(int) * 3)
114139
* stack_size / (duration * (1ULL << 30)));
115140
#endif
116-
/* warmup execution and prebuild JIT kernels */
141+
/* warmup execution and prebuild transpose-kernel */
142+
for (r = 0; r < warmup / 2; ++r) {
143+
CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev,
144+
DBCSR_TYPE(ELEM_TYPE), k, n, MAX_KERNEL_DIM, stream), &result);
145+
CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev,
146+
DBCSR_TYPE(ELEM_TYPE), n, k, MAX_KERNEL_DIM, stream), &result);
147+
}
148+
#if defined(USE_LIBXSMM)
149+
CHECK(acc_stream_sync(stream), &result);
150+
start = libxsmm_timer_tick();
151+
#endif
152+
/* to perform NN-SMMs on the device, all B-matrices are transposed upfront (SMM-kernel is limited to NT) */
153+
CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev,
154+
DBCSR_TYPE(ELEM_TYPE), k, n, MAX_KERNEL_DIM, stream), &result);
155+
#if defined(USE_LIBXSMM)
156+
CHECK(acc_stream_sync(stream), &result);
157+
transpose = libxsmm_timer_duration(start, libxsmm_timer_tick());
158+
#endif
159+
/* warmup execution and prebuild SMM-kernel */
117160
for (r = 0; r < warmup; ++r) {
118161
CHECK(libsmm_acc_process(stack_hst, stack_dev, stack_size, 3/*nparams*/, DBCSR_TYPE(ELEM_TYPE),
119162
amat_dev, bmat_dev, cmat_dev, m, n, k, MAX_KERNEL_DIM, 1/*homogeneous*/, stream, stream), &result);
120163
}
164+
CHECK(acc_memset_zero(cmat_dev, 0/*offset*/, sizeof(ELEM_TYPE) * mn * nc, stream), &result);
121165
#if defined(USE_LIBXSMM)
122166
CHECK(acc_stream_sync(stream), &result);
123167
start = libxsmm_timer_tick();
124168
#endif
125169
for (r = 0; r < nrepeat; ++r) {
126-
/* GPU-kernel is limited to C += Ai * Bi^T (i.e., NT, for NN, all Bi must be transposed upfront) */
170+
/* GPU-kernel is limited to C += Ai * Bi^T, i.e., NT (for NN, all Bi must be transposed upfront) */
127171
CHECK(libsmm_acc_process(stack_hst, stack_dev, stack_size, 3/*nparams*/, DBCSR_TYPE(ELEM_TYPE),
128172
amat_dev, bmat_dev, cmat_dev, m, n, k, MAX_KERNEL_DIM, 1/*homogeneous*/, stream, stream), &result);
129173
}
130174
#if defined(USE_LIBXSMM)
131175
CHECK(acc_stream_sync(stream), &result);
132176
duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
133177
if (EXIT_SUCCESS == result) {
134-
const char transa = 'N', transb = 'T';
178+
ELEM_TYPE *const gold_hst = (ELEM_TYPE*)libxsmm_malloc(sizeof(ELEM_TYPE) * mn * nc);
179+
const char transa = 'N', transb = 'N';
135180
const ELEM_TYPE alpha = 1, beta = 1;
181+
printf("transpose: %.1f ms %.1f GFLOPS/s\n", 1000.0 * (duration + transpose) / nrepeat,
182+
((size_t)2 * m * n * k) * stack_size / ((duration + transpose) * (1ULL << 30) / nrepeat));
136183
printf("device: %.1f ms %.1f GFLOPS/s\n", 1000.0 * duration / nrepeat,
137184
((size_t)2 * m * n * k) * stack_size / (duration * (1ULL << 30) / nrepeat));
138-
memset(cmat_hst, 0, sizeof(ELEM_TYPE) * mn * stack_size);
185+
memset(gold_hst, 0, sizeof(ELEM_TYPE) * mn * nc);
186+
for (r = 0; r < warmup; ++r) {
187+
libxsmm_gemm_batch_omp(LIBXSMM_GEMM_PRECISION(ELEM_TYPE), LIBXSMM_GEMM_PRECISION(ELEM_TYPE),
188+
&transa, &transb, m, n, k, &alpha, amat_hst, &m/*lda*/, bmat_hst, &k/*ldb*/,
189+
&beta, gold_hst, &m/*ldc*/, 1/*index_base*/, sizeof(int) * 3,
190+
stack_hst + 0, stack_hst + 1, stack_hst + 2, stack_size);
191+
}
192+
memset(gold_hst, 0, sizeof(ELEM_TYPE) * mn * nc);
139193
start = libxsmm_timer_tick();
194+
/* CPU-kernel operates on data that is not initialized in NUMA-aware fashion */
140195
for (r = 0; r < nrepeat; ++r) {
141-
/* CPU-kernel performs C += Ai * Bi^T to match result of GPU-kernel (NT may perform below NN) */
142196
libxsmm_gemm_batch_omp(LIBXSMM_GEMM_PRECISION(ELEM_TYPE), LIBXSMM_GEMM_PRECISION(ELEM_TYPE),
143197
&transa, &transb, m, n, k, &alpha, amat_hst, &m/*lda*/, bmat_hst, &k/*ldb*/,
144-
&beta, cmat_hst, &m/*ldc*/, 1/*index_base*/, sizeof(int) * 3,
198+
&beta, gold_hst, &m/*ldc*/, 1/*index_base*/, sizeof(int) * 3,
145199
stack_hst + 0, stack_hst + 1, stack_hst + 2, stack_size);
146200
}
147201
duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
148202
printf("host: %.1f ms %.1f GFLOPS/s\n", 1000.0 * duration / nrepeat,
149203
((size_t)2 * m * n * k) * stack_size / (duration * (1ULL << 30) / nrepeat));
150-
/* transfer result from device back to host for validation */
151-
CHECK(acc_memcpy_d2h(cmat_dev, cmat_hst, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
204+
/* transfer result from device to host for validation */
205+
CHECK(acc_memcpy_d2h(cmat_dev, cmat_hst, sizeof(ELEM_TYPE) * mn * nc, stream), &result);
152206
CHECK(acc_stream_sync(stream), &result);
153-
/* TODO: validation code TBD */
207+
if (EXIT_SUCCESS == result) {
208+
double abserror = 0, relerror = 0;
209+
for (i = 0; i < nc; ++i) {
210+
const ELEM_TYPE *const gold = gold_hst + mn * i;
211+
const ELEM_TYPE *const test = cmat_hst + mn * i;
212+
double diff = 0, a = 0, b = 0;
213+
for (r = 0; r < (m * n); ++r) {
214+
const double ar = (double)gold[r];
215+
const double br = (double)test[r];
216+
const double d = fabs(ar - br);
217+
if (d > diff) {
218+
diff = d;
219+
a = ar;
220+
b = br;
221+
}
222+
}
223+
if (0 < diff) {
224+
# if defined(_DEBUG)
225+
print(stderr, "gold = ", gold, m, n);
226+
print(stderr, "test = ", test, m, n);
227+
fprintf(stderr, "diff = %g (%g != %g)\n", diff, a, b);
228+
# endif
229+
if (abserror < diff) {
230+
relerror = fabs(0 != a ? (diff / a) : (diff / b));
231+
abserror = diff;
232+
}
233+
}
234+
}
235+
printf("max.error: rel=%g\n", relerror);
236+
if (EPSILON < relerror) result = EXIT_FAILURE;
237+
}
238+
libxsmm_free(gold_hst);
154239
}
155240
#endif
156241
CHECK(acc_host_mem_deallocate(stack_hst, stream), NULL);
242+
CHECK(acc_host_mem_deallocate(trans_hst, stream), NULL);
157243
CHECK(acc_host_mem_deallocate(amat_hst, stream), NULL);
158244
CHECK(acc_host_mem_deallocate(bmat_hst, stream), NULL);
159245
CHECK(acc_host_mem_deallocate(cmat_hst, stream), NULL);
160246
CHECK(acc_dev_mem_deallocate(stack_dev), NULL);
247+
CHECK(acc_dev_mem_deallocate(trans_dev), NULL);
161248
CHECK(acc_dev_mem_deallocate(amat_dev), NULL);
162249
CHECK(acc_dev_mem_deallocate(bmat_dev), NULL);
163250
CHECK(acc_dev_mem_deallocate(cmat_dev), NULL);
164251
CHECK(acc_stream_destroy(stream), NULL);
252+
CHECK(acc_finalize(), NULL);
165253
if (EXIT_SUCCESS != result) {
166254
fprintf(stderr, "FAILED\n");
167255
}

src/acc/acc_bench_trans.c

+20-6
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ int main(int argc, char* argv[])
7575
#endif
7676
int *stack_hst = NULL, *stack_dev = NULL;
7777
ELEM_TYPE *mat_hst = NULL, *mat_dev = NULL;
78-
int result = EXIT_SUCCESS, r, i, mm = m, nn = n;
78+
int result = EXIT_SUCCESS, ndevices = 0, r, i, mm = m, nn = n;
7979
void *stream = NULL;
8080
#if defined(USE_LIBXSMM)
8181
libxsmm_timer_tickint start;
@@ -84,6 +84,20 @@ int main(int argc, char* argv[])
8484
assert(m <= (mn / n) && 0 == (mn % n));
8585
printf("%s%s%i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n);
8686
CHECK(acc_init(), &result);
87+
CHECK(acc_get_ndevices(&ndevices), &result);
88+
if (0 < ndevices) {
89+
#if defined(_DEBUG)
90+
fprintf(stderr, "number of devices found: %i\n", ndevices);
91+
#endif
92+
}
93+
else {
94+
#if defined(_DEBUG)
95+
fprintf(stderr, "Error: no device found!\n");
96+
#endif
97+
CHECK(acc_finalize(), NULL);
98+
return result;
99+
}
100+
printf("element type: %s\n", DBCSR_STRINGIFY(ELEM_TYPE));
87101
#if defined(PRIORITY)
88102
CHECK(acc_stream_priority_range(&priomin, &priomax), &result);
89103
CHECK(acc_stream_create(&stream, "stream", (priomin + priomax) / 2), &result);
@@ -154,24 +168,23 @@ int main(int argc, char* argv[])
154168
printf("host: %.1f ms %.1f GB/s\n", 1000.0 * duration / nodd,
155169
(sizeof(ELEM_TYPE) * mn + sizeof(int))
156170
* stack_size / (duration * (1ULL << 30) / nodd));
157-
/* transfer result from device back to host for validation */
171+
/* transfer result from device to host for validation */
158172
CHECK(acc_memcpy_d2h(mat_dev, mat_hst,
159173
sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
160174
CHECK(acc_stream_sync(stream), &result);
161175
if (EXIT_SUCCESS == result) {
162176
unsigned int nerrors = 0;
163-
int j;
164177
for (i = 0; i < stack_size; ++i) {
165178
ELEM_TYPE gold[MAX_KERNEL_DIM*MAX_KERNEL_DIM];
166179
const ELEM_TYPE *const test = mat_hst + mn * i;
167180
init(i/*seed*/, gold, m, n);
168181
libxsmm_itrans(gold, sizeof(ELEM_TYPE), m, n, m, n);
169-
for (j = 0; j < (m * n); ++j) {
170-
if (gold[j] != test[j]) {
182+
for (r = 0; r < (m * n); ++r) {
183+
if (gold[r] != test[r]) {
171184
++nerrors;
172185
# if defined(_DEBUG)
173186
print(stderr, "gold = ", gold, n, m);
174-
print(stderr, "this = ", test, n, m);
187+
print(stderr, "test = ", test, n, m);
175188
init(i/*seed*/, gold, m, n);
176189
print(stderr, "orig = ", gold, m, n);
177190
fprintf(stderr, "\n");
@@ -190,6 +203,7 @@ int main(int argc, char* argv[])
190203
CHECK(acc_dev_mem_deallocate(stack_dev), NULL);
191204
CHECK(acc_dev_mem_deallocate(mat_dev), NULL);
192205
CHECK(acc_stream_destroy(stream), NULL);
206+
CHECK(acc_finalize(), NULL);
193207
if (EXIT_SUCCESS != result) {
194208
fprintf(stderr, "FAILED\n");
195209
}

src/acc/acc_libsmm.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111

1212
#include "acc.h"
1313

14-
#define DBCSR_CONCATENATE(A, B) A##B
1514
#define DBCSR_TYPE(T) DBCSR_CONCATENATE(DBCSR_TYPE_, T)
1615
#define DBCSR_TYPE_double dbcsr_type_real_8
1716
#define DBCSR_TYPE_float dbcsr_type_real_4
@@ -29,6 +28,7 @@ typedef enum libsmm_acc_data_t {
2928
} libsmm_acc_data_t;
3029

3130
int libsmm_acc_init(void);
31+
int libsmm_acc_finalize(void);
3232
acc_bool_t libsmm_acc_is_thread_safe(void);
3333

3434
int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size,

src/acc/cuda/acc_init.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,5 +45,5 @@ extern "C" int acc_finalize(){
4545
ACC_API_CALL(GetDevice, (&myDevice));
4646
ACC_DRV_CALL(DeviceGet, (&acc_device, myDevice));
4747
ACC_DRV_CALL(DevicePrimaryCtxRelease, (acc_device));
48-
return 0;
48+
return libsmm_acc_finalize();
4949
}

src/acc/libsmm_acc/PACKAGE

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"description": "Generic GPU-accelerated library for small matrix multiplications",
2+
"description": "CUDA/HIP-accelerated library for small matrix multiplications",
33
"archive": "libdbcsr",
44
"requires": ["..", "../cuda", "../hip"]
55
}
+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
cusmm_kernels.h

src/acc/libsmm_acc/libcusmm/PACKAGE

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
22
"description": "Cuda accelerated Small Matrix Multiplications",
33
"archive": "libdbcsr",
4-
"requires": ["kernels", "../include", "../../include"]
4+
"requires": ["kernels", "..", "../../include"]
55
}

src/acc/libsmm_acc/libsmm_acc_init.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ int libsmm_acc_gpu_blas_init(){
5454

5555

5656
//===========================================================================
57-
int libsmm_acc_init() {
57+
extern "C" int libsmm_acc_init() {
5858
#if !defined(NO_DBCSR_TIMESET)
5959
std::string routineN = "libsmm_acc_init";
6060
int handle;
@@ -71,7 +71,7 @@ int libsmm_acc_init() {
7171

7272

7373
//===========================================================================
74-
int libsmm_acc_finalize() {
74+
extern "C" int libsmm_acc_finalize() {
7575
#if !defined(NO_DBCSR_TIMESET)
7676
std::string routineN = "libsmm_acc_finalize";
7777
int handle;

src/acc/libsmm_acc/libsmm_acc_init.h

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ void timestop(int handle);
2424
#endif
2525

2626
extern "C" int libsmm_acc_init (void);
27+
extern "C" int libsmm_acc_finalize (void);
2728

2829
int libsmm_acc_gpu_blas_init();
2930

0 commit comments

Comments
 (0)