1
-
2
1
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
3
2
// University of Bristol HPC
4
3
//
5
4
// For full license terms please see the LICENSE file distributed with this
6
5
// source code
7
6
8
-
9
7
#include " CUDAStream.h"
10
8
11
9
[[noreturn]] inline void error (char const * file, int line, char const * expr, cudaError_t e) {
17
15
// Error-checking wrapper for CUDA runtime calls: evaluates EXPR exactly once
// and, on any failure, forwards file/line and the stringified expression to
// error() (which does not return). NOTE: there must be no space between the
// macro name and its parameter list, otherwise this becomes an object-like
// macro and every CU(...) call site expands incorrectly.
#define CU(EXPR) do { auto err_ = (EXPR); if (err_ != cudaSuccess) error(__FILE__, __LINE__, #EXPR, err_); } while (false)
18
16
19
17
// Integer ceiling division: smallest q such that q * b >= a (requires b > 0).
// Marked __host__ __device__ constexpr so both host launch-configuration code
// and device code can evaluate it at compile time.
// NOTE: (a + b - 1) can wrap if a is within b of SIZE_MAX — callers here pass
// array sizes far below that.
__host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1) / b; }
21
19
22
20
// File-scope CUDA stream used for every kernel launch and synchronisation in
// this file (see dot() below); destroyed in ~CUDAStream() via
// cudaStreamDestroy. Presumably created in the constructor — creation is not
// visible in this excerpt.
cudaStream_t stream;
23
21
24
22
template <class T >
25
- CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
23
+ CUDAStream<T>::CUDAStream(const int array_size, const int device_index)
24
+ : array_size(array_size)
26
25
{
27
26
// Set device
28
27
int count;
@@ -43,20 +42,16 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
43
42
#else
44
43
std::cout << " Memory: DEFAULT" << std::endl;
45
44
#endif
46
- array_size = ARRAY_SIZE;
47
-
48
45
49
46
// Query device for sensible dot kernel block count
50
47
cudaDeviceProp props;
51
48
CU (cudaGetDeviceProperties (&props, device_index));
52
49
dot_num_blocks = props.multiProcessorCount * 4 ;
53
50
54
- // Allocate the host array for partial sums for dot kernels
55
- sums = (T*)malloc (sizeof (T) * dot_num_blocks);
56
-
57
- size_t array_bytes = sizeof (T);
58
- array_bytes *= ARRAY_SIZE;
59
- size_t total_bytes = array_bytes * 4 ;
51
+ // Size of partial sums for dot kernels
52
+ size_t sums_bytes = sizeof (T) * dot_num_blocks;
53
+ size_t array_bytes = sizeof (T) * array_size;
54
+ size_t total_bytes = array_bytes * size_t (3 ) + sums_bytes;
60
55
std::cout << " Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl;
61
56
62
57
// Check buffers fit on the device
@@ -68,45 +63,42 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
68
63
CU (cudaMallocManaged (&d_a, array_bytes));
69
64
CU (cudaMallocManaged (&d_b, array_bytes));
70
65
CU (cudaMallocManaged (&d_c, array_bytes));
71
- CU (cudaMallocManaged (&d_sum, dot_num_blocks* sizeof (T) ));
66
+ CU (cudaHostAlloc (&sums, sums_bytes, cudaHostAllocDefault ));
72
67
#elif defined(PAGEFAULT)
73
68
d_a = (T*)malloc (array_bytes);
74
69
d_b = (T*)malloc (array_bytes);
75
70
d_c = (T*)malloc (array_bytes);
76
- d_sum = (T*)malloc (sizeof (T)*dot_num_blocks );
71
+ sums = (T*)malloc (sums_bytes );
77
72
#else
78
73
CU (cudaMalloc (&d_a, array_bytes));
79
74
CU (cudaMalloc (&d_b, array_bytes));
80
75
CU (cudaMalloc (&d_c, array_bytes));
81
- CU (cudaMalloc (&d_sum, dot_num_blocks* sizeof (T) ));
76
+ CU (cudaHostAlloc (&sums, sums_bytes, cudaHostAllocDefault ));
82
77
#endif
83
78
}
84
79
85
-
86
80
// Tear down in reverse of construction: destroy the shared stream, then
// release the three data arrays and the partial-sum buffer with the
// deallocator matching how each build variant allocated them.
template <class T>
CUDAStream<T>::~CUDAStream()
{
  CU(cudaStreamDestroy(stream));

#if defined(PAGEFAULT)
  // PAGEFAULT builds allocate everything with plain malloc.
  free(d_a);
  free(d_b);
  free(d_c);
  free(sums);
#else
  CU(cudaFree(d_a));
  CU(cudaFree(d_b));
  CU(cudaFree(d_c));
  // sums was allocated with cudaHostAlloc (pinned host memory), so it must be
  // released with cudaFreeHost, not free/cudaFree.
  CU(cudaFreeHost(sums));
#endif
}
104
97
105
-
106
98
template <typename T>
107
99
__global__ void init_kernel (T * a, T * b, T * c, T initA, T initB, T initC, int array_size)
108
100
{
109
- for (int i = blockDim .x * blockIdx .x + threadIdx .x ; i < array_size; i += gridDim .x * blockDim .x ) {
101
+ for (int i = threadIdx .x + blockDim .x * blockIdx .x ; i < array_size; i += gridDim .x * blockDim .x ) {
110
102
a[i] = initA;
111
103
b[i] = initB;
112
104
c[i] = initC;
@@ -128,7 +120,7 @@ void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vecto
128
120
// Copy device memory to host
129
121
#if defined(PAGEFAULT) || defined(MANAGED)
130
122
CU (cudaStreamSynchronize (stream));
131
- for (int i = 0 ; i < array_size; i++ )
123
+ for (int i = 0 ; i < array_size; ++i )
132
124
{
133
125
a[i] = d_a[i];
134
126
b[i] = d_b[i];
@@ -141,11 +133,10 @@ void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vecto
141
133
#endif
142
134
}
143
135
144
-
145
136
// STREAM "copy" kernel: c[i] = a[i].
// Grid-stride loop, so any launch configuration covers all array_size elements.
template <typename T>
__global__ void copy_kernel(const T * a, T * c, int array_size)
{
  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x)
  {
    c[i] = a[i];
  }
}
@@ -163,7 +154,7 @@ template <typename T>
// STREAM "mul" kernel: b[i] = scalar * c[i], with the fixed BabelStream
// startScalar. Grid-stride loop covers the whole array for any grid size.
template <typename T>
__global__ void mul_kernel(T * b, const T * c, int array_size)
{
  const T scalar = startScalar;
  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x)
  {
    b[i] = scalar * c[i];
  }
}
@@ -180,7 +171,7 @@ void CUDAStream<T>::mul()
180
171
// STREAM "add" kernel: c[i] = a[i] + b[i].
// Grid-stride loop covers the whole array for any grid size.
template <typename T>
__global__ void add_kernel(const T * a, const T * b, T * c, int array_size)
{
  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x)
  {
    c[i] = a[i] + b[i];
  }
}
@@ -198,7 +189,7 @@ template <typename T>
// STREAM "triad" kernel: a[i] = b[i] + scalar * c[i], with the fixed
// BabelStream startScalar. Grid-stride loop covers the whole array.
template <typename T>
__global__ void triad_kernel(T * a, const T * b, const T * c, int array_size)
{
  const T scalar = startScalar;
  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x)
  {
    a[i] = b[i] + scalar * c[i];
  }
}
@@ -216,7 +207,7 @@ template <typename T>
216
207
__global__ void nstream_kernel (T * a, const T * b, const T * c, int array_size)
217
208
{
218
209
const T scalar = startScalar;
219
- for (int i = blockDim .x * blockIdx .x + threadIdx .x ; i < array_size; i += gridDim .x * blockDim .x ) {
210
+ for (int i = threadIdx .x + blockDim .x * blockIdx .x ; i < array_size; i += gridDim .x * blockDim .x ) {
220
211
a[i] += b[i] + scalar * c[i];
221
212
}
222
213
}
@@ -231,50 +222,33 @@ void CUDAStream<T>::nstream()
231
222
}
232
223
233
224
// Block-level dot product: each thread accumulates a grid-stride partial sum
// of a[i]*b[i], the block reduces those TBSIZE partials in shared memory, and
// thread 0 writes one result per block to sums[blockIdx.x].
// Preconditions: sums holds at least gridDim.x elements; the halving tree
// reduction assumes blockDim.x (== TBSIZE at the launch site) is a power of two.
template <class T>
__global__ void dot_kernel(const T * a, const T * b, T * sums, int array_size)
{
  __shared__ T smem[TBSIZE];

  // Per-thread partial product sum over a grid-stride sweep. array_size is
  // int, so plain int indexing (as in the other kernels) cannot overflow here.
  T partial{};
  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x)
  {
    partial += a[i] * b[i];
  }

  const int tidx = threadIdx.x;
  smem[tidx] = partial;

  // Shared-memory tree reduction: barrier before each round so every write to
  // smem is visible before a neighbour's slot is read. __syncthreads() is
  // outside the divergent `if`, so all threads reach it.
  for (int offset = blockDim.x / 2; offset > 0; offset /= 2)
  {
    __syncthreads();
    if (tidx < offset) smem[tidx] += smem[tidx + offset];
  }

  if (tidx == 0) sums[blockIdx.x] = smem[0];
}
257
242
258
243
// Dot product of d_a and d_b: the kernel writes one partial sum per block
// directly into the host-accessible `sums` buffer (allocated with
// cudaHostAlloc / malloc depending on the build), then the final reduction
// over dot_num_blocks partials runs on the host after synchronising.
template <class T>
T CUDAStream<T>::dot()
{
  dot_kernel<<<dot_num_blocks, TBSIZE, 0, stream>>>(d_a, d_b, sums, array_size);
  CU(cudaPeekAtLastError());          // surface launch-configuration errors
  CU(cudaStreamSynchronize(stream));  // sums[] is only valid after this

  T sum{};
  for (int i = 0; i < dot_num_blocks; ++i)
    sum += sums[i];

  return sum;
}
@@ -302,15 +276,13 @@ void listDevices(void)
302
276
}
303
277
}
304
278
305
-
306
279
// Return the human-readable name the CUDA runtime reports for `device`.
// Aborts via CU() if the device index is invalid.
std::string getDeviceName(const int device)
{
  cudaDeviceProp props;
  CU(cudaGetDeviceProperties(&props, device));
  return std::string(props.name);
}
312
285
313
-
314
286
std::string getDeviceDriver (const int device)
315
287
{
316
288
CU (cudaSetDevice (device));
0 commit comments