1
-
2
1
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
3
2
// University of Bristol HPC
4
3
//
5
4
// For full license terms please see the LICENSE file distributed with this
6
5
// source code
7
6
8
-
9
7
#include " CUDAStream.h"
10
8
11
9
[[noreturn]] inline void error (char const * file, int line, char const * expr, cudaError_t e) {
17
15
// Error-checking wrapper for CUDA runtime calls: evaluates EXPR exactly once
// and, on any failure, forwards file/line and the stringified expression to
// error() (which does not return). NOTE: there must be no space between the
// macro name and its parameter list, otherwise this becomes an object-like
// macro and every CU(...) call site expands incorrectly.
#define CU(EXPR) do { auto err_ = (EXPR); if (err_ != cudaSuccess) error(__FILE__, __LINE__, #EXPR, err_); } while (false)
18
16
19
17
// Integer ceiling division: smallest q such that q * b >= a (requires b > 0).
// Marked __host__ __device__ constexpr so both host launch-configuration code
// and device code can evaluate it at compile time.
// NOTE: (a + b - 1) can wrap if a is within b of SIZE_MAX — callers here pass
// array sizes far below that.
__host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1) / b; }
21
19
22
20
// File-scope CUDA stream used for every kernel launch and synchronisation in
// this file (see dot() below); destroyed in ~CUDAStream() via
// cudaStreamDestroy. Presumably created in the constructor — creation is not
// visible in this excerpt.
cudaStream_t stream;
23
21
24
22
template <class T >
25
- CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
23
+ CUDAStream<T>::CUDAStream(const int array_size, const int device_index)
24
+ : array_size(array_size)
26
25
{
27
26
// Set device
28
27
int count;
@@ -43,20 +42,16 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
43
42
#else
44
43
std::cout << " Memory: DEFAULT" << std::endl;
45
44
#endif
46
- array_size = ARRAY_SIZE;
47
-
48
45
49
46
// Query device for sensible dot kernel block count
50
47
cudaDeviceProp props;
51
48
CU (cudaGetDeviceProperties (&props, device_index));
52
49
dot_num_blocks = props.multiProcessorCount * 4 ;
53
50
54
- // Allocate the host array for partial sums for dot kernels
55
- sums = (T*)malloc (sizeof (T) * dot_num_blocks);
56
-
57
- size_t array_bytes = sizeof (T);
58
- array_bytes *= ARRAY_SIZE;
59
- size_t total_bytes = array_bytes * 4 ;
51
+ // Size of partial sums for dot kernels
52
+ size_t sums_bytes = sizeof (T) * dot_num_blocks;
53
+ size_t array_bytes = sizeof (T) * array_size;
54
+ size_t total_bytes = array_bytes * size_t (3 ) + sums_bytes;
60
55
std::cout << " Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl;
61
56
62
57
// Check buffers fit on the device
@@ -68,45 +63,42 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
68
63
CU (cudaMallocManaged (&d_a, array_bytes));
69
64
CU (cudaMallocManaged (&d_b, array_bytes));
70
65
CU (cudaMallocManaged (&d_c, array_bytes));
71
- CU (cudaMallocManaged (&d_sum, dot_num_blocks* sizeof (T) ));
66
+ CU (cudaHostAlloc (&sums, sums_bytes, cudaHostAllocDefault ));
72
67
#elif defined(PAGEFAULT)
73
68
d_a = (T*)malloc (array_bytes);
74
69
d_b = (T*)malloc (array_bytes);
75
70
d_c = (T*)malloc (array_bytes);
76
- d_sum = (T*)malloc (sizeof (T)*dot_num_blocks );
71
+ sums = (T*)malloc (sums_bytes );
77
72
#else
78
73
CU (cudaMalloc (&d_a, array_bytes));
79
74
CU (cudaMalloc (&d_b, array_bytes));
80
75
CU (cudaMalloc (&d_c, array_bytes));
81
- CU (cudaMalloc (&d_sum, dot_num_blocks* sizeof (T) ));
76
+ CU (cudaHostAlloc (&sums, sums_bytes, cudaHostAllocDefault ));
82
77
#endif
83
78
}
84
79
85
-
86
80
// Tear down in reverse of construction: destroy the shared stream, then
// release the three data arrays and the partial-sum buffer with the
// deallocator matching how each build variant allocated them.
template <class T>
CUDAStream<T>::~CUDAStream()
{
  CU(cudaStreamDestroy(stream));

#if defined(PAGEFAULT)
  // PAGEFAULT builds allocate everything with plain malloc.
  free(d_a);
  free(d_b);
  free(d_c);
  free(sums);
#else
  CU(cudaFree(d_a));
  CU(cudaFree(d_b));
  CU(cudaFree(d_c));
  // sums was allocated with cudaHostAlloc (pinned host memory), so it must be
  // released with cudaFreeHost, not free/cudaFree.
  CU(cudaFreeHost(sums));
#endif
}
104
97
105
-
106
98
template <typename T>
107
99
__global__ void init_kernel (T * a, T * b, T * c, T initA, T initB, T initC, int array_size)
108
100
{
109
- for (int i = blockDim .x * blockIdx .x + threadIdx .x ; i < array_size; i += gridDim .x * blockDim .x ) {
101
+ for (int i = threadIdx .x + blockDim .x * blockIdx .x ; i < array_size; i += gridDim .x * blockDim .x ) {
110
102
a[i] = initA;
111
103
b[i] = initB;
112
104
c[i] = initC;
@@ -128,7 +120,7 @@ void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vecto
128
120
// Copy device memory to host
129
121
#if defined(PAGEFAULT) || defined(MANAGED)
130
122
CU (cudaStreamSynchronize (stream));
131
- for (int i = 0 ; i < array_size; i++ )
123
+ for (int i = 0 ; i < array_size; ++i )
132
124
{
133
125
a[i] = d_a[i];
134
126
b[i] = d_b[i];
@@ -141,11 +133,10 @@ void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vecto
141
133
#endif
142
134
}
143
135
144
-
145
136
// STREAM "copy" kernel: c[i] = a[i].
// Grid-stride loop, so any launch configuration covers all array_size elements.
template <typename T>
__global__ void copy_kernel(const T * a, T * c, int array_size)
{
  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x)
  {
    c[i] = a[i];
  }
}
@@ -163,7 +154,7 @@ template <typename T>
// STREAM "mul" kernel: b[i] = scalar * c[i], with the fixed BabelStream
// startScalar. Grid-stride loop covers the whole array for any grid size.
template <typename T>
__global__ void mul_kernel(T * b, const T * c, int array_size)
{
  const T scalar = startScalar;
  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x)
  {
    b[i] = scalar * c[i];
  }
}
@@ -180,7 +171,7 @@ void CUDAStream<T>::mul()
180
171
// STREAM "add" kernel: c[i] = a[i] + b[i].
// Grid-stride loop covers the whole array for any grid size.
template <typename T>
__global__ void add_kernel(const T * a, const T * b, T * c, int array_size)
{
  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x)
  {
    c[i] = a[i] + b[i];
  }
}
@@ -198,7 +189,7 @@ template <typename T>
// STREAM "triad" kernel: a[i] = b[i] + scalar * c[i], with the fixed
// BabelStream startScalar. Grid-stride loop covers the whole array.
template <typename T>
__global__ void triad_kernel(T * a, const T * b, const T * c, int array_size)
{
  const T scalar = startScalar;
  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x)
  {
    a[i] = b[i] + scalar * c[i];
  }
}
@@ -216,7 +207,7 @@ template <typename T>
216
207
__global__ void nstream_kernel (T * a, const T * b, const T * c, int array_size)
217
208
{
218
209
const T scalar = startScalar;
219
- for (int i = blockDim .x * blockIdx .x + threadIdx .x ; i < array_size; i += gridDim .x * blockDim .x ) {
210
+ for (int i = threadIdx .x + blockDim .x * blockIdx .x ; i < array_size; i += gridDim .x * blockDim .x ) {
220
211
a[i] += b[i] + scalar * c[i];
221
212
}
222
213
}
@@ -231,50 +222,33 @@ void CUDAStream<T>::nstream()
231
222
}
232
223
233
224
// Block-level dot product: each thread accumulates a grid-stride partial sum
// of a[i]*b[i], the block reduces those TBSIZE partials in shared memory, and
// thread 0 writes one result per block to sums[blockIdx.x].
// Preconditions: sums holds at least gridDim.x elements; the halving tree
// reduction assumes blockDim.x (== TBSIZE at the launch site) is a power of two.
template <class T>
__global__ void dot_kernel(const T * a, const T * b, T * sums, int array_size)
{
  __shared__ T smem[TBSIZE];

  // Per-thread partial product sum over a grid-stride sweep. array_size is
  // int, so plain int indexing (as in the other kernels) cannot overflow here.
  T partial{};
  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x)
  {
    partial += a[i] * b[i];
  }

  const int tidx = threadIdx.x;
  smem[tidx] = partial;

  // Shared-memory tree reduction: barrier before each round so every write to
  // smem is visible before a neighbour's slot is read. __syncthreads() is
  // outside the divergent `if`, so all threads reach it.
  for (int offset = blockDim.x / 2; offset > 0; offset /= 2)
  {
    __syncthreads();
    if (tidx < offset) smem[tidx] += smem[tidx + offset];
  }

  if (tidx == 0) sums[blockIdx.x] = smem[0];
}
257
242
258
243
// Dot product of d_a and d_b: the kernel writes one partial sum per block
// directly into the host-accessible `sums` buffer (allocated with
// cudaHostAlloc / malloc depending on the build), then the final reduction
// over dot_num_blocks partials runs on the host after synchronising.
template <class T>
T CUDAStream<T>::dot()
{
  dot_kernel<<<dot_num_blocks, TBSIZE, 0, stream>>>(d_a, d_b, sums, array_size);
  CU(cudaPeekAtLastError());          // surface launch-configuration errors
  CU(cudaStreamSynchronize(stream));  // sums[] is only valid after this

  T sum{};
  for (int i = 0; i < dot_num_blocks; ++i)
    sum += sums[i];

  return sum;
}
@@ -302,15 +276,13 @@ void listDevices(void)
302
276
}
303
277
}
304
278
305
-
306
279
// Return the human-readable name the CUDA runtime reports for `device`.
// Aborts via CU() if the device index is invalid.
std::string getDeviceName(const int device)
{
  cudaDeviceProp props;
  CU(cudaGetDeviceProperties(&props, device));
  return std::string(props.name);
}
312
285
313
-
314
286
std::string getDeviceDriver (const int device)
315
287
{
316
288
CU (cudaSetDevice (device));
0 commit comments