diff --git a/CMakeLists.txt b/CMakeLists.txt index 27736b6b..c98cab48 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,8 +31,8 @@ endmacro() # the final executable name set(EXE_NAME babelstream) -# for chrono and some basic CXX features, models can overwrite this if required -set(CMAKE_CXX_STANDARD 11) +# for chrono, make_unique, and some basic CXX features, models can overwrite this if required +set(CMAKE_CXX_STANDARD 14) if (NOT CMAKE_BUILD_TYPE) message("No CMAKE_BUILD_TYPE specified, defaulting to 'Release'") diff --git a/src/Stream.h b/src/Stream.h index eb4ffd4f..45c144c3 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -20,7 +20,6 @@ template class Stream { public: - virtual ~Stream(){} // Kernels @@ -35,10 +34,8 @@ class Stream // Copy memory between host and device virtual void init_arrays(T initA, T initB, T initC) = 0; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) = 0; - }; - // Implementation specific device functions void listDevices(void); std::string getDeviceName(const int); diff --git a/src/StreamModels.h b/src/StreamModels.h new file mode 100644 index 00000000..0a4649b9 --- /dev/null +++ b/src/StreamModels.h @@ -0,0 +1,105 @@ +#pragma once +#include + +#if defined(CUDA) +#include "CUDAStream.h" +#elif defined(STD_DATA) +#include "STDDataStream.h" +#elif defined(STD_INDICES) +#include "STDIndicesStream.h" +#elif defined(STD_RANGES) +#include "STDRangesStream.hpp" +#elif defined(TBB) +#include "TBBStream.hpp" +#elif defined(THRUST) +#include "ThrustStream.h" +#elif defined(HIP) +#include "HIPStream.h" +#elif defined(HC) +#include "HCStream.h" +#elif defined(OCL) +#include "OCLStream.h" +#elif defined(USE_RAJA) +#include "RAJAStream.hpp" +#elif defined(KOKKOS) +#include "KokkosStream.hpp" +#elif defined(ACC) +#include "ACCStream.h" +#elif defined(SYCL) +#include "SYCLStream.h" +#elif defined(SYCL2020) +#include "SYCLStream2020.h" +#elif defined(OMP) +#include "OMPStream.h" +#elif defined(FUTHARK) +#include "FutharkStream.h" +#endif + +template +std::unique_ptr> make_stream(int array_size, int deviceIndex) { +#if defined(CUDA) + // Use the CUDA implementation + return std::make_unique>(array_size, deviceIndex); + +#elif defined(HIP) + // Use the HIP implementation + return std::make_unique>(array_size, deviceIndex); + +#elif defined(HC) + // Use the HC implementation + return std::make_unique>(array_size, deviceIndex); + +#elif defined(OCL) + // Use the OpenCL implementation + return std::make_unique>(array_size, deviceIndex); + +#elif defined(USE_RAJA) + // Use the RAJA implementation + return std::make_unique>(array_size, deviceIndex); + +#elif defined(KOKKOS) + // Use the Kokkos implementation + return std::make_unique>(array_size, deviceIndex); + +#elif defined(STD_DATA) + // Use the C++ STD data-oriented implementation + return std::make_unique>(array_size, deviceIndex); + +#elif defined(STD_INDICES) + // Use the C++ STD index-oriented implementation + return std::make_unique>(array_size, deviceIndex); + +#elif defined(STD_RANGES) + // Use the C++ STD ranges implementation + return std::make_unique>(array_size, deviceIndex); + +#elif defined(TBB) + // Use the C++20 implementation + return std::make_unique>(array_size, deviceIndex); + +#elif defined(THRUST) + // Use the Thrust implementation + return std::make_unique>(array_size, deviceIndex); + +#elif defined(ACC) + // Use the OpenACC implementation + return std::make_unique>(array_size, deviceIndex); + +#elif defined(SYCL) || defined(SYCL2020) + // Use the SYCL implementation + return std::make_unique>(array_size, deviceIndex); + +#elif defined(OMP) + // Use the OpenMP implementation + return std::make_unique>(array_size, deviceIndex); + +#elif defined(FUTHARK) + // Use the Futhark implementation + return std::make_unique>(array_size, deviceIndex); + +#else + + #error unknown benchmark + +#endif +} diff --git a/src/Unit.h b/src/Unit.h new file mode 100644 index 00000000..2efde5d6 --- /dev/null +++ b/src/Unit.h @@ -0,0 +1,35 @@ +#pragma once +#include + +// Units for output: +struct Unit { + enum class Kind { MegaByte, GigaByte, TeraByte, MibiByte, GibiByte, TebiByte }; + + Kind value; + + explicit Unit(Kind v) : value(v) {} + + double fmt(double bytes) const { + switch(value) { + case Kind::MibiByte: return std::pow(2.0, -20.0) * bytes; + case Kind::MegaByte: return 1.0E-6 * bytes; + case Kind::GibiByte: return std::pow(2.0, -30.0) * bytes; + case Kind::GigaByte: return 1.0E-9 * bytes; + case Kind::TebiByte: return std::pow(2.0, -40.0) * bytes; + case Kind::TeraByte: return 1.0E-12 * bytes; + default: std::cerr << "Unimplemented!" << std::endl; std::abort(); + } + } + + char const* str() const { + switch(value) { + case Kind::MibiByte: return "MiB"; + case Kind::MegaByte: return "MB"; + case Kind::GibiByte: return "GiB"; + case Kind::GigaByte: return "GB"; + case Kind::TebiByte: return "TiB"; + case Kind::TeraByte: return "TB"; + default: std::cerr << "Unimplemented!" << std::endl; std::abort(); + } + } +}; diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu index cff1951a..a6361139 100644 --- a/src/cuda/CUDAStream.cu +++ b/src/cuda/CUDAStream.cu @@ -1,11 +1,9 @@ - // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, // University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this // source code - #include "CUDAStream.h" [[noreturn]] inline void error(char const* file, int line, char const* expr, cudaError_t e) { @@ -17,12 +15,13 @@ #define CU(EXPR) do { auto __e = (EXPR); if (__e != cudaSuccess) error(__FILE__, __LINE__, #EXPR, __e); } while(false) // It is best practice to include __device__ and constexpr even though in BabelStream it only needs to be __host__ const -__host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1)/b; } +__host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1) / b; } cudaStream_t stream; template -CUDAStream::CUDAStream(const int ARRAY_SIZE, const int device_index) +CUDAStream::CUDAStream(const int array_size, const int device_index) + : array_size(array_size) { // Set device int count; @@ -43,20 +42,16 @@ CUDAStream::CUDAStream(const int ARRAY_SIZE, const int device_index) #else std::cout << "Memory: DEFAULT" << std::endl; #endif - array_size = ARRAY_SIZE; - // Query device for sensible dot kernel block count cudaDeviceProp props; CU(cudaGetDeviceProperties(&props, device_index)); dot_num_blocks = props.multiProcessorCount * 4; - // Allocate the host array for partial sums for dot kernels - sums = (T*)malloc(sizeof(T) * dot_num_blocks); - - size_t array_bytes = sizeof(T); - array_bytes *= ARRAY_SIZE; - size_t total_bytes = array_bytes * 4; + // Size of partial sums for dot kernels + size_t sums_bytes = sizeof(T) * dot_num_blocks; + size_t array_bytes = sizeof(T) * array_size; + size_t total_bytes = array_bytes * size_t(3) + sums_bytes; std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl; // Check buffers fit on the device @@ -68,45 +63,42 @@ CUDAStream::CUDAStream(const int ARRAY_SIZE, const int device_index) CU(cudaMallocManaged(&d_a, array_bytes)); CU(cudaMallocManaged(&d_b, array_bytes)); CU(cudaMallocManaged(&d_c, array_bytes)); - CU(cudaMallocManaged(&d_sum, dot_num_blocks*sizeof(T))); + CU(cudaHostAlloc(&sums, sums_bytes, cudaHostAllocDefault)); #elif defined(PAGEFAULT) d_a = (T*)malloc(array_bytes); d_b = (T*)malloc(array_bytes); d_c = (T*)malloc(array_bytes); - d_sum = (T*)malloc(sizeof(T)*dot_num_blocks); + sums = (T*)malloc(sums_bytes); #else CU(cudaMalloc(&d_a, array_bytes)); CU(cudaMalloc(&d_b, array_bytes)); CU(cudaMalloc(&d_c, array_bytes)); - CU(cudaMalloc(&d_sum, dot_num_blocks*sizeof(T))); + CU(cudaHostAlloc(&sums, sums_bytes, cudaHostAllocDefault)); #endif } - template CUDAStream::~CUDAStream() { CU(cudaStreamDestroy(stream)); - free(sums); #if defined(PAGEFAULT) free(d_a); free(d_b); free(d_c); - free(d_sum); + free(sums); #else CU(cudaFree(d_a)); CU(cudaFree(d_b)); CU(cudaFree(d_c)); - CU(cudaFree(d_sum)); + CU(cudaFreeHost(sums)); #endif } - template __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC, int array_size) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) { + for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) { a[i] = initA; b[i] = initB; c[i] = initC; @@ -128,7 +120,7 @@ void CUDAStream::read_arrays(std::vector& a, std::vector& b, std::vecto // Copy device memory to host #if defined(PAGEFAULT) || defined(MANAGED) CU(cudaStreamSynchronize(stream)); - for (int i = 0; i < array_size; i++) + for (int i = 0; i < array_size; ++i) { a[i] = d_a[i]; b[i] = d_b[i]; @@ -141,11 +133,10 @@ void CUDAStream::read_arrays(std::vector& a, std::vector& b, std::vecto #endif } - template __global__ void copy_kernel(const T * a, T * c, int array_size) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) { + for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) { c[i] = a[i]; } } @@ -163,7 +154,7 @@ template __global__ void mul_kernel(T * b, const T * c, int array_size) { const T scalar = startScalar; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) { + for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) { b[i] = scalar * c[i]; } } @@ -180,7 +171,7 @@ void CUDAStream::mul() template __global__ void add_kernel(const T * a, const T * b, T * c, int array_size) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) { + for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) { c[i] = a[i] + b[i]; } } @@ -198,7 +189,7 @@ template __global__ void triad_kernel(T * a, const T * b, const T * c, int array_size) { const T scalar = startScalar; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) { + for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) { a[i] = b[i] + scalar * c[i]; } } @@ -216,7 +207,7 @@ template __global__ void nstream_kernel(T * a, const T * b, const T * c, int array_size) { const T scalar = startScalar; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) { + for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) { a[i] += b[i] + scalar * c[i]; } } @@ -231,50 +222,34 @@ void CUDAStream::nstream() } template -__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) +__global__ void dot_kernel(const T * a, const T * b, T* sums, int array_size) { - __shared__ T tb_sum[TBSIZE]; - - int i = blockDim.x * blockIdx.x + threadIdx.x; - const size_t local_i = threadIdx.x; - - tb_sum[local_i] = {}; - for (; i < array_size; i += blockDim.x*gridDim.x) - tb_sum[local_i] += a[i] * b[i]; + __shared__ T smem[TBSIZE]; + T tmp = T(0.); + const size_t tidx = threadIdx.x; + for (int i = tidx + (size_t)blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) { + tmp += a[i] * b[i]; + } + smem[tidx] = tmp; - for (int offset = blockDim.x / 2; offset > 0; offset /= 2) - { + for (int offset = blockDim.x / 2; offset > 0; offset /= 2) { __syncthreads(); - if (local_i < offset) - { - tb_sum[local_i] += tb_sum[local_i+offset]; - } + if (tidx < offset) smem[tidx] += smem[tidx+offset]; } - if (local_i == 0) - sum[blockIdx.x] = tb_sum[local_i]; + // First thread writes to host memory directly from the device + if (tidx == 0) sums[blockIdx.x] = smem[tidx]; } template T CUDAStream::dot() { - dot_kernel<<>>(d_a, d_b, d_sum, array_size); + dot_kernel<<>>(d_a, d_b, sums, array_size); CU(cudaPeekAtLastError()); - -#if !(defined(MANAGED) || defined(PAGEFAULT)) - CU(cudaMemcpyAsync(sums, d_sum, dot_num_blocks*sizeof(T), cudaMemcpyDeviceToHost, stream)); -#endif CU(cudaStreamSynchronize(stream)); T sum = 0.0; - for (int i = 0; i < dot_num_blocks; i++) - { -#if defined(MANAGED) || defined(PAGEFAULT) - sum += d_sum[i]; -#else - sum += sums[i]; -#endif - } + for (int i = 0; i < dot_num_blocks; ++i) sum += sums[i]; return sum; } @@ -302,7 +277,6 @@ void listDevices(void) } } - std::string getDeviceName(const int device) { cudaDeviceProp props; @@ -310,7 +284,6 @@ std::string getDeviceName(const int device) return std::string(props.name); } - std::string getDeviceDriver(const int device) { CU(cudaSetDevice(device)); diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h index d16511fe..54bf2a18 100644 --- a/src/cuda/CUDAStream.h +++ b/src/cuda/CUDAStream.h @@ -31,13 +31,11 @@ class CUDAStream : public Stream T *d_a; T *d_b; T *d_c; - T *d_sum; // Number of blocks for dot kernel int dot_num_blocks; public: - CUDAStream(const int, const int); ~CUDAStream(); @@ -50,5 +48,4 @@ class CUDAStream : public Stream virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - }; diff --git a/src/main.cpp b/src/main.cpp index abfc14e4..877127af 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,87 +1,87 @@ - // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, // University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this // source code -#include -#include -#include -#include -#include -#include #include -#include +#include +#include +#include #include +#include +#include +#include +#include +#include #define VERSION_STRING "5.0" #include "Stream.h" - -#if defined(CUDA) -#include "CUDAStream.h" -#elif defined(STD_DATA) -#include "STDDataStream.h" -#elif defined(STD_INDICES) -#include "STDIndicesStream.h" -#elif defined(STD_RANGES) -#include "STDRangesStream.hpp" -#elif defined(TBB) -#include "TBBStream.hpp" -#elif defined(THRUST) -#include "ThrustStream.h" -#elif defined(HIP) -#include "HIPStream.h" -#elif defined(HC) -#include "HCStream.h" -#elif defined(OCL) -#include "OCLStream.h" -#elif defined(USE_RAJA) -#include "RAJAStream.hpp" -#elif defined(KOKKOS) -#include "KokkosStream.hpp" -#elif defined(ACC) -#include "ACCStream.h" -#elif defined(SYCL) -#include "SYCLStream.h" -#elif defined(SYCL2020) -#include "SYCLStream2020.h" -#elif defined(OMP) -#include "OMPStream.h" -#elif defined(FUTHARK) -#include "FutharkStream.h" -#endif +#include "StreamModels.h" +#include "Unit.h" // Default size of 2^25 int ARRAY_SIZE = 33554432; -unsigned int num_times = 100; -unsigned int deviceIndex = 0; +size_t num_times = 100; +size_t deviceIndex = 0; bool use_float = false; bool output_as_csv = false; -bool mibibytes = false; +// Default unit of memory is MegaBytes (as per STREAM) +Unit unit{Unit::Kind::MegaByte}; +bool silence_errors = false; std::string csv_separator = ","; -template -void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c, T& sum); +// Benchmark Identifier: identifies individual & groups of benchmarks: +// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot. +// - All: all kernels. +// - Individual kernels only. +enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All}; + +struct Benchmark { + BenchId id; + char const* label; + // Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW: + // bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur + size_t weight; + // Is it one of: Copy, Mul, Add, Triad, Dot? + bool classic = false; +}; + +// Benchmarks in the order in which - if present - should be run for validation purposes: +constexpr size_t num_benchmarks = 6; +std::array bench = { + Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true }, + Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true }, + Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true }, + Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true }, + Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true }, + Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false } +}; + +// Selected benchmarks to run: default is all 5 classic benchmarks. +BenchId selection = BenchId::Classic; + +// Returns true if the benchmark needs to be run: +bool run_benchmark(Benchmark const& b) { + if (selection == BenchId::All) return true; + if (selection == BenchId::Classic && b.classic) return true; + return selection == b.id; +} + +// Benchmark run order +// - Classic: runs each bench once in the order above, and repeats n times. +// - Isolated: runs each bench n times in isolation +enum class BenchOrder : int {Classic, Isolated}; +BenchOrder order = BenchOrder::Classic; template void run(); -// Options for running the benchmark: -// - All 5 kernels (Copy, Add, Mul, Triad, Dot). -// - Triad only. -// - Nstream only. -enum class Benchmark {All, Triad, Nstream}; - -// Selected run options. -Benchmark selection = Benchmark::All; - void parseArguments(int argc, char *argv[]); int main(int argc, char *argv[]) { - parseArguments(argc, argv); if (!output_as_csv) @@ -95,106 +95,86 @@ int main(int argc, char *argv[]) if (use_float) run(); else - run(); + run(); + return EXIT_SUCCESS; } - -// Run the 5 main kernels -template -std::vector> run_all(Stream *stream, T& sum) -{ - - // List of times - std::vector> timings(5); - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Main loop - for (unsigned int k = 0; k < num_times; k++) - { - // Execute Copy - t1 = std::chrono::high_resolution_clock::now(); - stream->copy(); - t2 = std::chrono::high_resolution_clock::now(); - timings[0].push_back(std::chrono::duration_cast >(t2 - t1).count()); - - // Execute Mul - t1 = std::chrono::high_resolution_clock::now(); - stream->mul(); - t2 = std::chrono::high_resolution_clock::now(); - timings[1].push_back(std::chrono::duration_cast >(t2 - t1).count()); - - // Execute Add - t1 = std::chrono::high_resolution_clock::now(); - stream->add(); - t2 = std::chrono::high_resolution_clock::now(); - timings[2].push_back(std::chrono::duration_cast >(t2 - t1).count()); - - // Execute Triad - t1 = std::chrono::high_resolution_clock::now(); - stream->triad(); - t2 = std::chrono::high_resolution_clock::now(); - timings[3].push_back(std::chrono::duration_cast >(t2 - t1).count()); - - // Execute Dot - t1 = std::chrono::high_resolution_clock::now(); - sum = stream->dot(); - t2 = std::chrono::high_resolution_clock::now(); - timings[4].push_back(std::chrono::duration_cast >(t2 - t1).count()); - - } - - // Compiler should use a move - return timings; +// Returns duration of executing function f: +template +double time(F&& f) { + using clk_t = std::chrono::high_resolution_clock; + using dur_t = std::chrono::duration; + auto start = clk_t::now(); + f(); + return dur_t(clk_t::now() - start).count(); } -// Run the Triad kernel +// Run specified kernels template -std::vector> run_triad(Stream *stream) +std::vector> run_all(std::unique_ptr>& stream, T& sum) { + // Times for each measured benchmark: + std::vector> timings(num_benchmarks); - std::vector> timings(1); - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Run triad in loop - t1 = std::chrono::high_resolution_clock::now(); - for (unsigned int k = 0; k < num_times; k++) + // Run a particular benchmark + auto run = [&](Benchmark const& b) { - stream->triad(); - } - t2 = std::chrono::high_resolution_clock::now(); - - double runtime = std::chrono::duration_cast >(t2 - t1).count(); - timings[0].push_back(runtime); - - return timings; -} + switch(b.id) { + case BenchId::Copy: return stream->copy(); + case BenchId::Mul: return stream->mul(); + case BenchId::Add: return stream->add(); + case BenchId::Triad: return stream->triad(); + case BenchId::Dot: sum = stream->dot(); return; + case BenchId::Nstream: return stream->nstream(); + default: + std::cerr << "Unimplemented benchmark: " << b.label << std::endl; + abort(); + } + }; -// Run the Nstream kernel -template -std::vector> run_nstream(Stream *stream) -{ - std::vector> timings(1); + // Time a particular benchmark: + auto dt = [&](Benchmark const& b) { return time([&] { run(b); }); }; - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; + // Reserve timings: + for (size_t i = 0; i < num_benchmarks; ++i) { + if (!run_benchmark(bench[i])) continue; + timings[i].reserve(num_times); + } - // Run nstream in loop - for (int k = 0; k < num_times; k++) { - t1 = std::chrono::high_resolution_clock::now(); - stream->nstream(); - t2 = std::chrono::high_resolution_clock::now(); - timings[0].push_back(std::chrono::duration_cast >(t2 - t1).count()); + switch(order) { + // Classic runs each benchmark once in the order specifies in the "bench" array above, + // and then repeats num_times: + case BenchOrder::Classic: { + for (size_t k = 0; k < num_times; k++) { + for (size_t i = 0; i < num_benchmarks; ++i) { + if (!run_benchmark(bench[i])) continue; + timings[i].push_back(dt(bench[i])); + } + } + break; + } + // Isolated runs each benchmark num_times, before proceeding to run the next benchmark: + case BenchOrder::Isolated: { + for (size_t i = 0; i < num_benchmarks; ++i) { + if (!run_benchmark(bench[i])) continue; + auto t = time([&] { for (size_t k = 0; k < num_times; k++) run(bench[i]); }); + timings[i].resize(num_times, t / (double)num_times); + } + break; + } + default: + std::cerr << "Unimplemented order" << std::endl; + abort(); } + // Compiler should use a move return timings; - } +template +void check_solution(const size_t ntimes, std::vector& a, std::vector& b, std::vector& c, + T& sum); // Generic run routine // Runs the kernel(s) and prints output. @@ -203,206 +183,110 @@ void run() { std::streamsize ss = std::cout.precision(); + // Formatting utilities: + auto fmt_bw = [&](size_t weight, double dt) { + return unit.fmt((weight * sizeof(T) * ARRAY_SIZE)/dt); + }; + auto fmt_csv_header = [] { + std::cout + << "function" << csv_separator + << "num_times" << csv_separator + << "n_elements" << csv_separator + << "sizeof" << csv_separator + << "max_" << unit.str() << "_per_sec" << csv_separator + << "min_runtime" << csv_separator + << "max_runtime" << csv_separator + << "avg_runtime" << std::endl; + }; + auto fmt_csv = [](char const* function, size_t num_times, size_t num_elements, + size_t type_size, double bandwidth, + double dt_min, double dt_max, double dt_avg) { + std::cout << function << csv_separator + << num_times << csv_separator + << num_elements << csv_separator + << type_size << csv_separator + << bandwidth << csv_separator + << dt_min << csv_separator + << dt_max << csv_separator + << dt_avg << std::endl; + }; + auto fmt_cli = [](char const* function, double bandwidth, + double dt_min, double dt_max, double dt_avg) { + std::cout + << std::left << std::setw(12) << function + << std::left << std::setw(12) << std::setprecision(3) << bandwidth + << std::left << std::setw(12) << std::setprecision(5) << dt_min + << std::left << std::setw(12) << std::setprecision(5) << dt_max + << std::left << std::setw(12) << std::setprecision(5) << dt_avg + << std::endl; + }; + auto fmt_result = [&](char const* function, size_t num_times, size_t num_elements, + size_t type_size, double bandwidth, + double dt_min, double dt_max, double dt_avg) { + if (!output_as_csv) return fmt_cli(function, bandwidth, dt_min, dt_max, dt_avg); + fmt_csv(function, num_times, num_elements, type_size, bandwidth, dt_min, dt_max, dt_avg); + }; + if (!output_as_csv) { - if (selection == Benchmark::All) - std::cout << "Running kernels " << num_times << " times" << std::endl; - else if (selection == Benchmark::Triad) - { - std::cout << "Running triad " << num_times << " times" << std::endl; - std::cout << "Number of elements: " << ARRAY_SIZE << std::endl; - } - - - if (sizeof(T) == sizeof(float)) - std::cout << "Precision: float" << std::endl; - else - std::cout << "Precision: double" << std::endl; - - - if (mibibytes) - { - // MiB = 2^20 - std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*sizeof(T)*std::pow(2.0, -20.0) << " MiB" - << " (=" << ARRAY_SIZE*sizeof(T)*std::pow(2.0, -30.0) << " GiB)" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*std::pow(2.0, -20.0) << " MiB" - << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*std::pow(2.0, -30.0) << " GiB)" << std::endl; - } - else - { - // MB = 10^6 - std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" - << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" - << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; + std::cout << "Running "; + switch(selection) { + case BenchId::All: std::cout << " All kernels "; break; + case BenchId::Classic: std::cout << " Classic kernels "; break; + default: + std::cout << "Running "; + for (size_t i = 0; i < num_benchmarks; ++i) { + if (selection == bench[i].id) { + std::cout << bench[i].label; + break; + } + } + std::cout << " "; } + std::cout << num_times << " times" << std::endl; + std::cout << "Number of elements: " << ARRAY_SIZE << std::endl; + std::cout << "Precision: " << (sizeof(T) == sizeof(float)? "float" : "double") << std::endl; + + size_t nbytes = ARRAY_SIZE * sizeof(T); + std::cout << std::setprecision(1) << std::fixed + << "Array size: " << unit.fmt(nbytes) << " " << unit.str() << std::endl; + std::cout << "Total size: " << unit.fmt(3.0*nbytes) << " " << unit.str() << std::endl; std::cout.precision(ss); - } - Stream *stream; - -#if defined(CUDA) - // Use the CUDA implementation - stream = new CUDAStream(ARRAY_SIZE, deviceIndex); - -#elif defined(HIP) - // Use the HIP implementation - stream = new HIPStream(ARRAY_SIZE, deviceIndex); - -#elif defined(HC) - // Use the HC implementation - stream = new HCStream(ARRAY_SIZE, deviceIndex); - -#elif defined(OCL) - // Use the OpenCL implementation - stream = new OCLStream(ARRAY_SIZE, deviceIndex); - -#elif defined(USE_RAJA) - // Use the RAJA implementation - stream = new RAJAStream(ARRAY_SIZE, deviceIndex); - -#elif defined(KOKKOS) - // Use the Kokkos implementation - stream = new KokkosStream(ARRAY_SIZE, deviceIndex); - -#elif defined(STD_DATA) - // Use the C++ STD data-oriented implementation - stream = new STDDataStream(ARRAY_SIZE, deviceIndex); - -#elif defined(STD_INDICES) - // Use the C++ STD index-oriented implementation - stream = new STDIndicesStream(ARRAY_SIZE, deviceIndex); - -#elif defined(STD_RANGES) - // Use the C++ STD ranges implementation - stream = new STDRangesStream(ARRAY_SIZE, deviceIndex); - -#elif defined(TBB) - // Use the C++20 implementation - stream = new TBBStream(ARRAY_SIZE, deviceIndex); - -#elif defined(THRUST) - // Use the Thrust implementation - stream = new ThrustStream(ARRAY_SIZE, deviceIndex); - -#elif defined(ACC) - // Use the OpenACC implementation - stream = new ACCStream(ARRAY_SIZE, deviceIndex); - -#elif defined(SYCL) || defined(SYCL2020) - // Use the SYCL implementation - stream = new SYCLStream(ARRAY_SIZE, deviceIndex); - -#elif defined(OMP) - // Use the OpenMP implementation - stream = new OMPStream(ARRAY_SIZE, deviceIndex); - -#elif defined(FUTHARK) - // Use the Futhark implementation - stream = new FutharkStream(ARRAY_SIZE, deviceIndex); - -#endif - - auto init1 = std::chrono::high_resolution_clock::now(); - stream->init_arrays(startA, startB, startC); - auto init2 = std::chrono::high_resolution_clock::now(); + std::unique_ptr> stream = make_stream(ARRAY_SIZE, deviceIndex); + auto initElapsedS = time([&] { stream->init_arrays(startA, startB, startC); }); // Result of the Dot kernel, if used. T sum{}; + std::vector> timings = run_all(stream, sum); - std::vector> timings; - - switch (selection) - { - case Benchmark::All: - timings = run_all(stream, sum); - break; - case Benchmark::Triad: - timings = run_triad(stream); - break; - case Benchmark::Nstream: - timings = run_nstream(stream); - break; - }; - - // Check solutions - // Create host vectors - std::vector a(ARRAY_SIZE); - std::vector b(ARRAY_SIZE); - std::vector c(ARRAY_SIZE); - - - auto read1 = std::chrono::high_resolution_clock::now(); - stream->read_arrays(a, b, c); - auto read2 = std::chrono::high_resolution_clock::now(); + // Create & read host vectors: + std::vector a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE); + auto readElapsedS = time([&] { stream->read_arrays(a, b, c); }); - auto initElapsedS = std::chrono::duration_cast>(read2 - read1).count(); - auto readElapsedS = std::chrono::duration_cast>(init2 - init1).count(); - auto initBWps = ((mibibytes ? std::pow(2.0, -20.0) : 1.0E-6) * (3 * sizeof(T) * ARRAY_SIZE)) / initElapsedS; - auto readBWps = ((mibibytes ? std::pow(2.0, -20.0) : 1.0E-6) * (3 * sizeof(T) * ARRAY_SIZE)) / readElapsedS; + check_solution(num_times, a, b, c, sum); + auto initBWps = fmt_bw(3, initElapsedS); + auto readBWps = fmt_bw(3, readElapsedS); if (output_as_csv) { - std::cout - << "phase" << csv_separator - << "n_elements" << csv_separator - << "sizeof" << csv_separator - << ((mibibytes) ? "max_mibytes_per_sec" : "max_mbytes_per_sec") << csv_separator - << "runtime" << std::endl; - std::cout - << "Init" << csv_separator - << ARRAY_SIZE << csv_separator - << sizeof(T) << csv_separator - << initBWps << csv_separator - << initElapsedS << std::endl; - std::cout - << "Read" << csv_separator - << ARRAY_SIZE << csv_separator - << sizeof(T) << csv_separator - << readBWps << csv_separator - << readElapsedS << std::endl; + fmt_csv_header(); + fmt_csv("Init", 1, ARRAY_SIZE, sizeof(T), initBWps, initElapsedS, initElapsedS, initElapsedS); + fmt_csv("Read", 1, ARRAY_SIZE, sizeof(T), readBWps, readElapsedS, readElapsedS, readElapsedS); } else { std::cout << "Init: " << std::setw(7) - << initElapsedS - << " s (=" - << initBWps - << (mibibytes ? " MiBytes/sec" : " MBytes/sec") - << ")" << std::endl; + << initElapsedS << " s (=" << initBWps << " " << unit.str() << "/s" << ")" << std::endl; std::cout << "Read: " << std::setw(7) - << readElapsedS - << " s (=" - << readBWps - << (mibibytes ? " MiBytes/sec" : " MBytes/sec") - << ")" << std::endl; - } + << readElapsedS << " s (=" << readBWps << " " << unit.str() << "/s" << ")" << std::endl; - check_solution(num_times, a, b, c, sum); - - // Display timing results - if (output_as_csv) - { - std::cout - << "function" << csv_separator - << "num_times" << csv_separator - << "n_elements" << csv_separator - << "sizeof" << csv_separator - << ((mibibytes) ? "max_mibytes_per_sec" : "max_mbytes_per_sec") << csv_separator - << "min_runtime" << csv_separator - << "max_runtime" << csv_separator - << "avg_runtime" << std::endl; - } - else - { std::cout << std::left << std::setw(12) << "Function" - << std::left << std::setw(12) << ((mibibytes) ? "MiBytes/sec" : "MBytes/sec") + << std::left << std::setw(12) << (std::string(unit.str()) + "/s") << std::left << std::setw(12) << "Min (sec)" << std::left << std::setw(12) << "Max" << std::left << std::setw(12) << "Average" @@ -410,186 +294,135 @@ void run() << std::fixed; } - - if (selection == Benchmark::All || selection == Benchmark::Nstream) + for (size_t i = 0; i < num_benchmarks; ++i) { + if (!run_benchmark(bench[i])) continue; - std::vector labels; - std::vector sizes; + // Get min/max; ignore the first result + auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end()); - if (selection == Benchmark::All) - { - labels = {"Copy", "Mul", "Add", "Triad", "Dot"}; - sizes = { - 2 * sizeof(T) * ARRAY_SIZE, - 2 * sizeof(T) * ARRAY_SIZE, - 3 * sizeof(T) * ARRAY_SIZE, - 3 * sizeof(T) * ARRAY_SIZE, - 2 * sizeof(T) * ARRAY_SIZE}; - } else if (selection == Benchmark::Nstream) - { - labels = {"Nstream"}; - sizes = {4 * sizeof(T) * ARRAY_SIZE }; - } + // Calculate average; ignore the first result + double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) + / (double)(num_times - 1); - for (int i = 0; i < timings.size(); ++i) - { - // Get min/max; ignore the first result - auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end()); - - // Calculate average; ignore the first result - double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(num_times - 1); - - // Display results - if (output_as_csv) - { - std::cout - << labels[i] << csv_separator - << num_times << csv_separator - << ARRAY_SIZE << csv_separator - << sizeof(T) << csv_separator - << ((mibibytes) ? std::pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator - << *minmax.first << csv_separator - << *minmax.second << csv_separator - << average - << std::endl; - } - else - { - std::cout - << std::left << std::setw(12) << labels[i] - << std::left << std::setw(12) << std::setprecision(3) << - ((mibibytes) ? std::pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) - << std::left << std::setw(12) << std::setprecision(5) << *minmax.first - << std::left << std::setw(12) << std::setprecision(5) << *minmax.second - << std::left << std::setw(12) << std::setprecision(5) << average - << std::endl; - } - } - } else if (selection == Benchmark::Triad) - { - // Display timing results - double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times; - double bandwidth = ((mibibytes) ? std::pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]); - - if (output_as_csv) - { - std::cout - << "function" << csv_separator - << "num_times" << csv_separator - << "n_elements" << csv_separator - << "sizeof" << csv_separator - << ((mibibytes) ? "gibytes_per_sec" : "gbytes_per_sec") << csv_separator - << "runtime" - << std::endl; - std::cout - << "Triad" << csv_separator - << num_times << csv_separator - << ARRAY_SIZE << csv_separator - << sizeof(T) << csv_separator - << bandwidth << csv_separator - << timings[0][0] - << std::endl; - } - else - { - std::cout - << "--------------------------------" - << std::endl << std::fixed - << "Runtime (seconds): " << std::left << std::setprecision(5) - << timings[0][0] << std::endl - << "Bandwidth (" << ((mibibytes) ? "GiB/s" : "GB/s") << "): " - << std::left << std::setprecision(3) - << bandwidth << std::endl; - } + // Display results + fmt_result(bench[i].label, num_times, ARRAY_SIZE, sizeof(T), + fmt_bw(bench[i].weight, *minmax.first), *minmax.first, *minmax.second, average); } - - delete stream; - } - template -void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c, T& sum) +void check_solution(const size_t num_times, + std::vector& a, std::vector& b, std::vector& c, T& sum) { // Generate correct solution T goldA = startA; T goldB = startB; T goldC = startC; - T goldSum{}; + T goldS = T(0.); const T scalar = startScalar; - for (unsigned int i = 0; i < ntimes; i++) - { - // Do STREAM! - if (selection == Benchmark::All) - { - goldC = goldA; - goldB = scalar * goldC; - goldC = goldA + goldB; - goldA = goldB + scalar * goldC; - } else if (selection == Benchmark::Triad) - { - goldA = goldB + scalar * goldC; - } else if (selection == Benchmark::Nstream) - { - goldA += goldB + scalar * goldC; + // Updates output due to running each benchmark: + auto run = [&](int b) { + switch(bench[b].id) { + case BenchId::Copy: goldC = goldA; break; + case BenchId::Mul: goldB = scalar * goldC; break; + case BenchId::Add: goldC = goldA + goldB; break; + case BenchId::Triad: goldA = goldB + scalar * goldC; break; + case BenchId::Nstream: goldA += goldB + scalar * goldC; break; + case BenchId::Dot: goldS = goldA * goldB * T(ARRAY_SIZE); break; // This calculates the answer exactly + default: + std::cerr << "Unimplemented Check: " << bench[b].label << std::endl; + abort(); + } + }; + + switch(order) { + // Classic runs each benchmark once in the order specifies in the "bench" array above, + // and then repeats num_times: + case BenchOrder::Classic: { + for (size_t k = 0; k < num_times; k++) { + for (size_t i = 0; i < num_benchmarks; ++i) { + if (!run_benchmark(bench[i])) continue; + run(i); + } } + break; + } + // Isolated runs each benchmark num_times, before proceeding to run the next benchmark: + case BenchOrder::Isolated: { + for (size_t i = 0; i < num_benchmarks; ++i) { + if (!run_benchmark(bench[i])) continue; + for (size_t k = 0; k < num_times; k++) run(i); + } + break; + } + default: + std::cerr << "Unimplemented order" << std::endl; + abort(); } - // Do the reduction - goldSum = goldA * goldB * ARRAY_SIZE; + // Error relative tolerance check + size_t failed = 0; + T epsi = std::numeric_limits::epsilon() * T(100000.0); + auto check = [&](const char* name, T is, T should, T e, size_t i = size_t(-1)) { + if (e > epsi) { + ++failed; + if (failed > 10) return; + std::cerr << "FAILED validation of " << name; + if (i != size_t(-1)) std::cerr << "[" << i << "]"; + std::cerr << ": " << is << " != " << should + << ", relative error=" << e << " > " << epsi << std::endl; + } + }; - // Calculate the average error - long double errA = std::accumulate(a.begin(), a.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldA); }); - errA /= a.size(); - long double errB = std::accumulate(b.begin(), b.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldB); }); - errB /= b.size(); - long double errC = std::accumulate(c.begin(), c.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldC); }); - errC /= c.size(); - long double errSum = std::fabs((sum - goldSum)/goldSum); + // Sum + T eS = std::fabs(sum - goldS) / std::fabs(goldS); + for (size_t i = 0; i < num_benchmarks; ++i) { + if (bench[i].id != BenchId::Dot) continue; + if (run_benchmark(bench[i])) + check("sum", sum, goldS, eS); + break; + } - long double epsi = std::numeric_limits::epsilon() * 100.0; + // Calculate the L^infty-norm relative error + for (size_t i = 0; i < a.size(); ++i) { + T vA = a[i], vB = b[i], vC = c[i]; + T eA = std::fabs(vA - goldA) / std::fabs(goldA); + T eB = std::fabs(vB - goldB) / std::fabs(goldB); + T eC = std::fabs(vC - goldC) / std::fabs(goldC); - if (errA > epsi) - std::cerr - << "Validation failed on a[]. Average error " << errA - << std::endl; - if (errB > epsi) - std::cerr - << "Validation failed on b[]. Average error " << errB - << std::endl; - if (errC > epsi) - std::cerr - << "Validation failed on c[]. Average error " << errC - << std::endl; - // Check sum to 8 decimal places - if (selection == Benchmark::All && errSum > 1.0E-8) - std::cerr - << "Validation failed on sum. Error " << errSum - << std::endl << std::setprecision(15) - << "Sum was " << sum << " but should be " << goldSum - << std::endl; + check("a", a[i], goldA, eA, i); + check("b", b[i], goldB, eB, i); + check("c", c[i], goldC, eC, i); + } + if (failed > 0 && !silence_errors) + std::exit(EXIT_FAILURE); } -int parseUInt(const char *str, unsigned int *output) +void parseArguments(int argc, char *argv[]) { - char *next; - *output = strtoul(str, &next, 10); - return !strlen(next); -} + auto parseUInt =[](const char *str, size_t *output) { + char *next; + *output = strtoull(str, &next, 10); + return !strlen(next); + }; + auto parseInt = [](const char *str, intptr_t *output) { + char *next; + *output = strtoll(str, &next, 10); + return !strlen(next); + }; -int parseInt(const char *str, int *output) -{ - char *next; - *output = strtol(str, &next, 10); - return !strlen(next); -} + // Prints all available benchmark labels: + auto print_labels = [&](auto& os) { + for (size_t i = 0; i < num_benchmarks; ++i) { + os << bench[i].label; + if (i != (num_benchmarks - 1)) os << ","; + } + }; -void parseArguments(int argc, char *argv[]) -{ for (int i = 1; i < argc; i++) { if (!std::string("--list").compare(argv[i])) @@ -608,11 +441,13 @@ void parseArguments(int argc, char *argv[]) else if (!std::string("--arraysize").compare(argv[i]) || !std::string("-s").compare(argv[i])) { - if (++i >= argc || !parseInt(argv[i], &ARRAY_SIZE) || ARRAY_SIZE <= 0) + intptr_t array_size; + if (++i >= argc || !parseInt(argv[i], &array_size) || array_size <= 0) { std::cerr << "Invalid array size." << std::endl; - exit(EXIT_FAILURE); + std::exit(EXIT_FAILURE); } + ARRAY_SIZE = array_size; } else if (!std::string("--numtimes").compare(argv[i]) || !std::string("-n").compare(argv[i])) @@ -632,13 +467,57 @@ void parseArguments(int argc, char *argv[]) { use_float = true; } - else if (!std::string("--triad-only").compare(argv[i])) + else if (!std::string("--print-names").compare(argv[i])) + { + std::cout << "Available benchmarks: "; + print_labels(std::cout); + std::cout << std::endl; + std::exit(EXIT_SUCCESS); + } + else if (!std::string("--only").compare(argv[i]) || !std::string("-o").compare(argv[i])) { - selection = Benchmark::Triad; + if (++i >= argc) + { + std::cerr << "Expected benchmark name after --only" << std::endl; + std::exit(EXIT_FAILURE); + } + auto key = std::string(argv[i]); + if (key == "Classic") + { + selection = BenchId::Classic; + } + else if (key == "All") + { + selection = BenchId::All; + } + else + { + auto p = std::find_if(bench.begin(), bench.end(), [&](Benchmark const& b) { + return std::string(b.label) == key; + }); + if (p == bench.end()) { + std::cerr << "Unknown benchmark name \"" << argv[i] << "\" after --only" << std::endl; + std::cerr << "Available benchmarks: All, Classic,"; + print_labels(std::cerr); + std::cerr << std::endl; + std::exit(EXIT_FAILURE); + } + selection = p->id; + } } - else if (!std::string("--nstream-only").compare(argv[i])) + else if (!std::string("--order").compare(argv[i])) { - selection = Benchmark::Nstream; + if (++i >= argc) + { + std::cerr << "Expected benchmark order after --order. Options: \"classic\" (default), \"isolated\"." + << std::endl; + exit(EXIT_FAILURE); + } + auto key = std::string(argv[i]); + if (key == "isolated") + { + order = BenchOrder::Isolated; + } } else if (!std::string("--csv").compare(argv[i])) { @@ -646,7 +525,31 @@ void parseArguments(int argc, char *argv[]) } else if (!std::string("--mibibytes").compare(argv[i])) { - mibibytes = true; + unit = Unit(Unit::Kind::MibiByte); + } + else if (!std::string("--megabytes").compare(argv[i])) + { + unit = Unit(Unit::Kind::MegaByte); + } + else if (!std::string("--gibibytes").compare(argv[i])) + { + unit = Unit(Unit::Kind::GibiByte); + } + else if (!std::string("--gigabytes").compare(argv[i])) + { + unit = Unit(Unit::Kind::GigaByte); + } + else if (!std::string("--tebibytes").compare(argv[i])) + { + unit = Unit(Unit::Kind::TebiByte); + } + else if (!std::string("--terabytes").compare(argv[i])) + { + unit = Unit(Unit::Kind::TeraByte); + } + else if (!std::string("--silence-errors").compare(argv[i])) + { + silence_errors = true; } else if (!std::string("--help").compare(argv[i]) || !std::string("-h").compare(argv[i])) @@ -660,18 +563,25 @@ void parseArguments(int argc, char *argv[]) std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl; std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl; std::cout << " --float Use floats (rather than doubles)" << std::endl; - std::cout << " --triad-only Only run triad" << std::endl; - std::cout << " --nstream-only Only run nstream" << std::endl; + std::cout << " -o --only NAME Only run one benchmark (see --print-names)" << std::endl; + std::cout << " --print-names Prints all available benchmark names" << std::endl; + std::cout << " --order Benchmark run order: \"classic\" (default) or \"isolated\"." << std::endl; std::cout << " --csv Output as csv table" << std::endl; + std::cout << " --megabytes Use MB=10^6 for bandwidth calculation (default)" << std::endl; std::cout << " --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6)" << std::endl; + std::cout << " --gibibytes Use GiB=2^30 for bandwidth calculation (default MB=10^6)" << std::endl; + std::cout << " --gigabytes Use GB=10^9 for bandwidth calculation (default MB=10^6)" << std::endl; + std::cout << " --tebibytes Use TiB=2^40 for bandwidth calculation (default MB=10^6)" << std::endl; + std::cout << " --terabytes Use TB=10^12 for bandwidth calculation (default MB=10^6)" << std::endl; + std::cout << " --silence-errors Ignores validation errors." << std::endl; std::cout << std::endl; - exit(EXIT_SUCCESS); + std::exit(EXIT_SUCCESS); } else { std::cerr << "Unrecognized argument '" << argv[i] << "' (try '--help')" << std::endl; - exit(EXIT_FAILURE); + std::exit(EXIT_FAILURE); } } } diff --git a/src/omp/model.cmake b/src/omp/model.cmake index 1955ebc8..56f37cf5 100644 --- a/src/omp/model.cmake +++ b/src/omp/model.cmake @@ -151,7 +151,7 @@ macro(setup) # offload but OFFLOAD_FLAGS overrides register_definitions(OMP_TARGET_GPU) separate_arguments(OFFLOAD_FLAGS) - list(OMP_FLAGS APPEND ${OFFLOAD_FLAGS}) + list(APPEND OMP_FLAGS ${OFFLOAD_FLAGS}) else () # handle the vendor:arch value