Skip to content

Commit cac5047

Browse files
committed
Cleanup
1 parent 7149a55 commit cac5047

File tree

6 files changed

+205
-231
lines changed

6 files changed

+205
-231
lines changed

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ endmacro()
3131
# the final executable name
3232
set(EXE_NAME babelstream)
3333

34-
# for chrono and some basic CXX features, models can overwrite this if required
35-
set(CMAKE_CXX_STANDARD 11)
34+
# for chrono, make_unique, and some basic CXX features, models can overwrite this if required
35+
set(CMAKE_CXX_STANDARD 14)
3636

3737
if (NOT CMAKE_BUILD_TYPE)
3838
message("No CMAKE_BUILD_TYPE specified, defaulting to 'Release'")

src/Stream.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ template <class T>
2020
class Stream
2121
{
2222
public:
23-
2423
virtual ~Stream(){}
2524

2625
// Kernels
@@ -35,10 +34,8 @@ class Stream
3534
// Copy memory between host and device
3635
virtual void init_arrays(T initA, T initB, T initC) = 0;
3736
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
38-
3937
};
4038

41-
4239
// Implementation specific device functions
4340
void listDevices(void);
4441
std::string getDeviceName(const int);

src/StreamModels.h

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -36,66 +36,66 @@
3636
#endif
3737

3838
template <typename T>
39-
std::unique_ptr<Stream<T>> make_stream(int ARRAY_SIZE, unsigned int deviceIndex) {
39+
std::unique_ptr<Stream<T>> make_stream(int array_size, int deviceIndex) {
4040
#if defined(CUDA)
4141
// Use the CUDA implementation
42-
return std::make_unique<CUDAStream<T>>(ARRAY_SIZE, deviceIndex);
42+
return std::make_unique<CUDAStream<T>>(array_size, deviceIndex);
4343

4444
#elif defined(HIP)
4545
// Use the HIP implementation
46-
return std::make_unique<HIPStream<T>>(ARRAY_SIZE, deviceIndex);
46+
return std::make_unique<HIPStream<T>>(array_size, deviceIndex);
4747

4848
#elif defined(HC)
4949
// Use the HC implementation
50-
return std::make_unique<HCStream<T>>(ARRAY_SIZE, deviceIndex);
50+
return std::make_unique<HCStream<T>>(array_size, deviceIndex);
5151

5252
#elif defined(OCL)
5353
// Use the OpenCL implementation
54-
return std::make_unique<OCLStream<T>>(ARRAY_SIZE, deviceIndex);
54+
return std::make_unique<OCLStream<T>>(array_size, deviceIndex);
5555

5656
#elif defined(USE_RAJA)
5757
// Use the RAJA implementation
58-
return std::make_unique<RAJAStream<T>>(ARRAY_SIZE, deviceIndex);
58+
return std::make_unique<RAJAStream<T>>(array_size, deviceIndex);
5959

6060
#elif defined(KOKKOS)
6161
// Use the Kokkos implementation
62-
return std::make_unique<KokkosStream<T>>(ARRAY_SIZE, deviceIndex);
62+
return std::make_unique<KokkosStream<T>>(array_size, deviceIndex);
6363

6464
#elif defined(STD_DATA)
6565
// Use the C++ STD data-oriented implementation
66-
return std::make_unique<STDDataStream<T>>(ARRAY_SIZE, deviceIndex);
66+
return std::make_unique<STDDataStream<T>>(array_size, deviceIndex);
6767

6868
#elif defined(STD_INDICES)
6969
// Use the C++ STD index-oriented implementation
70-
return std::make_unique<STDIndicesStream<T>>(ARRAY_SIZE, deviceIndex);
70+
return std::make_unique<STDIndicesStream<T>>(array_size, deviceIndex);
7171

7272
#elif defined(STD_RANGES)
7373
// Use the C++ STD ranges implementation
74-
return std::make_unique<STDRangesStream<T>>(ARRAY_SIZE, deviceIndex);
74+
return std::make_unique<STDRangesStream<T>>(array_size, deviceIndex);
7575

7676
#elif defined(TBB)
7777
// Use the TBB implementation
78-
return std::make_unique<TBBStream<T>>(ARRAY_SIZE, deviceIndex);
78+
return std::make_unique<TBBStream<T>>(array_size, deviceIndex);
7979

8080
#elif defined(THRUST)
8181
// Use the Thrust implementation
82-
return std::make_unique<ThrustStream<T>>(ARRAY_SIZE, deviceIndex);
82+
return std::make_unique<ThrustStream<T>>(array_size, deviceIndex);
8383

8484
#elif defined(ACC)
8585
// Use the OpenACC implementation
86-
return std::make_unique<ACCStream<T>>(ARRAY_SIZE, deviceIndex);
86+
return std::make_unique<ACCStream<T>>(array_size, deviceIndex);
8787

8888
#elif defined(SYCL) || defined(SYCL2020)
8989
// Use the SYCL implementation
90-
return std::make_unique<SYCLStream<T>>(ARRAY_SIZE, deviceIndex);
90+
return std::make_unique<SYCLStream<T>>(array_size, deviceIndex);
9191

9292
#elif defined(OMP)
9393
// Use the OpenMP implementation
94-
return std::make_unique<OMPStream<T>>(ARRAY_SIZE, deviceIndex);
94+
return std::make_unique<OMPStream<T>>(array_size, deviceIndex);
9595

9696
#elif defined(FUTHARK)
9797
// Use the Futhark implementation
98-
return std::make_unique<FutharkStream<T>>(ARRAY_SIZE, deviceIndex);
98+
return std::make_unique<FutharkStream<T>>(array_size, deviceIndex);
9999

100100
#else
101101

src/cuda/CUDAStream.cu

Lines changed: 32 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
1-
21
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
32
// University of Bristol HPC
43
//
54
// For full license terms please see the LICENSE file distributed with this
65
// source code
76

8-
97
#include "CUDAStream.h"
108

119
[[noreturn]] inline void error(char const* file, int line, char const* expr, cudaError_t e) {
@@ -17,12 +15,13 @@
1715
#define CU(EXPR) do { auto __e = (EXPR); if (__e != cudaSuccess) error(__FILE__, __LINE__, #EXPR, __e); } while(false)
1816

1917
// It is best practice to include __device__ and constexpr even though in BabelStream it only needs to be __host__ const
20-
__host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1)/b; }
18+
__host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1) / b; }
2119

2220
cudaStream_t stream;
2321

2422
template <class T>
25-
CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
23+
CUDAStream<T>::CUDAStream(const int array_size, const int device_index)
24+
: array_size(array_size)
2625
{
2726
// Set device
2827
int count;
@@ -43,20 +42,16 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
4342
#else
4443
std::cout << "Memory: DEFAULT" << std::endl;
4544
#endif
46-
array_size = ARRAY_SIZE;
47-
4845

4946
// Query device for sensible dot kernel block count
5047
cudaDeviceProp props;
5148
CU(cudaGetDeviceProperties(&props, device_index));
5249
dot_num_blocks = props.multiProcessorCount * 4;
5350

54-
// Allocate the host array for partial sums for dot kernels
55-
sums = (T*)malloc(sizeof(T) * dot_num_blocks);
56-
57-
size_t array_bytes = sizeof(T);
58-
array_bytes *= ARRAY_SIZE;
59-
size_t total_bytes = array_bytes * 4;
51+
// Size of partial sums for dot kernels
52+
size_t sums_bytes = sizeof(T) * dot_num_blocks;
53+
size_t array_bytes = sizeof(T) * array_size;
54+
size_t total_bytes = array_bytes * size_t(3) + sums_bytes;
6055
std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl;
6156

6257
// Check buffers fit on the device
@@ -68,45 +63,42 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
6863
CU(cudaMallocManaged(&d_a, array_bytes));
6964
CU(cudaMallocManaged(&d_b, array_bytes));
7065
CU(cudaMallocManaged(&d_c, array_bytes));
71-
CU(cudaMallocManaged(&d_sum, dot_num_blocks*sizeof(T)));
66+
CU(cudaHostAlloc(&sums, sums_bytes, cudaHostAllocDefault));
7267
#elif defined(PAGEFAULT)
7368
d_a = (T*)malloc(array_bytes);
7469
d_b = (T*)malloc(array_bytes);
7570
d_c = (T*)malloc(array_bytes);
76-
d_sum = (T*)malloc(sizeof(T)*dot_num_blocks);
71+
sums = (T*)malloc(sums_bytes);
7772
#else
7873
CU(cudaMalloc(&d_a, array_bytes));
7974
CU(cudaMalloc(&d_b, array_bytes));
8075
CU(cudaMalloc(&d_c, array_bytes));
81-
CU(cudaMalloc(&d_sum, dot_num_blocks*sizeof(T)));
76+
CU(cudaHostAlloc(&sums, sums_bytes, cudaHostAllocDefault));
8277
#endif
8378
}
8479

85-
8680
template <class T>
8781
CUDAStream<T>::~CUDAStream()
8882
{
8983
CU(cudaStreamDestroy(stream));
90-
free(sums);
9184

9285
#if defined(PAGEFAULT)
9386
free(d_a);
9487
free(d_b);
9588
free(d_c);
96-
free(d_sum);
89+
free(sums);
9790
#else
9891
CU(cudaFree(d_a));
9992
CU(cudaFree(d_b));
10093
CU(cudaFree(d_c));
101-
CU(cudaFree(d_sum));
94+
CU(cudaFreeHost(sums));
10295
#endif
10396
}
10497

105-
10698
template <typename T>
10799
__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC, int array_size)
108100
{
109-
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
101+
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
110102
a[i] = initA;
111103
b[i] = initB;
112104
c[i] = initC;
@@ -128,7 +120,7 @@ void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vecto
128120
// Copy device memory to host
129121
#if defined(PAGEFAULT) || defined(MANAGED)
130122
CU(cudaStreamSynchronize(stream));
131-
for (int i = 0; i < array_size; i++)
123+
for (int i = 0; i < array_size; ++i)
132124
{
133125
a[i] = d_a[i];
134126
b[i] = d_b[i];
@@ -141,11 +133,10 @@ void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vecto
141133
#endif
142134
}
143135

144-
145136
template <typename T>
146137
__global__ void copy_kernel(const T * a, T * c, int array_size)
147138
{
148-
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
139+
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
149140
c[i] = a[i];
150141
}
151142
}
@@ -163,7 +154,7 @@ template <typename T>
163154
__global__ void mul_kernel(T * b, const T * c, int array_size)
164155
{
165156
const T scalar = startScalar;
166-
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
157+
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
167158
b[i] = scalar * c[i];
168159
}
169160
}
@@ -180,7 +171,7 @@ void CUDAStream<T>::mul()
180171
template <typename T>
181172
__global__ void add_kernel(const T * a, const T * b, T * c, int array_size)
182173
{
183-
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
174+
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
184175
c[i] = a[i] + b[i];
185176
}
186177
}
@@ -198,7 +189,7 @@ template <typename T>
198189
__global__ void triad_kernel(T * a, const T * b, const T * c, int array_size)
199190
{
200191
const T scalar = startScalar;
201-
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
192+
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
202193
a[i] = b[i] + scalar * c[i];
203194
}
204195
}
@@ -216,7 +207,7 @@ template <typename T>
216207
__global__ void nstream_kernel(T * a, const T * b, const T * c, int array_size)
217208
{
218209
const T scalar = startScalar;
219-
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
210+
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
220211
a[i] += b[i] + scalar * c[i];
221212
}
222213
}
@@ -231,50 +222,33 @@ void CUDAStream<T>::nstream()
231222
}
232223

233224
template <class T>
234-
__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
225+
__global__ void dot_kernel(const T * a, const T * b, T* sums, int array_size)
235226
{
236-
__shared__ T tb_sum[TBSIZE];
237-
238-
int i = blockDim.x * blockIdx.x + threadIdx.x;
239-
const size_t local_i = threadIdx.x;
240-
241-
tb_sum[local_i] = {};
242-
for (; i < array_size; i += blockDim.x*gridDim.x)
243-
tb_sum[local_i] += a[i] * b[i];
227+
__shared__ T smem[TBSIZE];
228+
T tmp = T(0.);
229+
const size_t tidx = threadIdx.x;
230+
for (int i = tidx + (size_t)blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
231+
tmp += a[i] * b[i];
232+
}
233+
smem[tidx] = tmp;
244234

245-
for (int offset = blockDim.x / 2; offset > 0; offset /= 2)
246-
{
235+
for (int offset = blockDim.x / 2; offset > 0; offset /= 2) {
247236
__syncthreads();
248-
if (local_i < offset)
249-
{
250-
tb_sum[local_i] += tb_sum[local_i+offset];
251-
}
237+
if (tidx < offset) smem[tidx] += smem[tidx+offset];
252238
}
253239

254-
if (local_i == 0)
255-
sum[blockIdx.x] = tb_sum[local_i];
240+
if (tidx == 0) sums[blockIdx.x] = smem[tidx];
256241
}
257242

258243
template <class T>
259244
T CUDAStream<T>::dot()
260245
{
261-
dot_kernel<<<dot_num_blocks, TBSIZE, 0, stream>>>(d_a, d_b, d_sum, array_size);
246+
dot_kernel<<<dot_num_blocks, TBSIZE, 0, stream>>>(d_a, d_b, sums, array_size);
262247
CU(cudaPeekAtLastError());
263-
264-
#if !(defined(MANAGED) || defined(PAGEFAULT))
265-
CU(cudaMemcpyAsync(sums, d_sum, dot_num_blocks*sizeof(T), cudaMemcpyDeviceToHost, stream));
266-
#endif
267248
CU(cudaStreamSynchronize(stream));
268249

269250
T sum = 0.0;
270-
for (int i = 0; i < dot_num_blocks; i++)
271-
{
272-
#if defined(MANAGED) || defined(PAGEFAULT)
273-
sum += d_sum[i];
274-
#else
275-
sum += sums[i];
276-
#endif
277-
}
251+
for (int i = 0; i < dot_num_blocks; ++i) sum += sums[i];
278252

279253
return sum;
280254
}
@@ -302,15 +276,13 @@ void listDevices(void)
302276
}
303277
}
304278

305-
306279
std::string getDeviceName(const int device)
307280
{
308281
cudaDeviceProp props;
309282
CU(cudaGetDeviceProperties(&props, device));
310283
return std::string(props.name);
311284
}
312285

313-
314286
std::string getDeviceDriver(const int device)
315287
{
316288
CU(cudaSetDevice(device));

src/cuda/CUDAStream.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,11 @@ class CUDAStream : public Stream<T>
3131
T *d_a;
3232
T *d_b;
3333
T *d_c;
34-
T *d_sum;
3534

3635
// Number of blocks for dot kernel
3736
int dot_num_blocks;
3837

3938
public:
40-
4139
CUDAStream(const int, const int);
4240
~CUDAStream();
4341

@@ -50,5 +48,4 @@ class CUDAStream : public Stream<T>
5048

5149
virtual void init_arrays(T initA, T initB, T initC) override;
5250
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
53-
5451
};

0 commit comments

Comments (0)