diff --git a/CMakeLists.txt b/CMakeLists.txt
index 27736b6b..c98cab48 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,8 +31,8 @@ endmacro()
 # the final executable name
 set(EXE_NAME babelstream)
 
-# for chrono and some basic CXX features, models can overwrite this if required
-set(CMAKE_CXX_STANDARD 11)
+# for chrono, make_unique, and some basic CXX features, models can overwrite this if required
+set(CMAKE_CXX_STANDARD 14)
 
 if (NOT CMAKE_BUILD_TYPE)
     message("No CMAKE_BUILD_TYPE specified, defaulting to 'Release'")
diff --git a/src/Stream.h b/src/Stream.h
index eb4ffd4f..45c144c3 100644
--- a/src/Stream.h
+++ b/src/Stream.h
@@ -20,7 +20,6 @@ template <class T>
 class Stream
 {
   public:
-
     virtual ~Stream(){}
 
     // Kernels
@@ -35,10 +34,8 @@ class Stream
     // Copy memory between host and device
     virtual void init_arrays(T initA, T initB, T initC) = 0;
     virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
-
 };
 
-
 // Implementation specific device functions
 void listDevices(void);
 std::string getDeviceName(const int);
diff --git a/src/StreamModels.h b/src/StreamModels.h
new file mode 100644
index 00000000..0a4649b9
--- /dev/null
+++ b/src/StreamModels.h
@@ -0,0 +1,105 @@
+#pragma once
+#include <memory>
+
+#if defined(CUDA)
+#include "CUDAStream.h"
+#elif defined(STD_DATA)
+#include "STDDataStream.h"
+#elif defined(STD_INDICES)
+#include "STDIndicesStream.h"
+#elif defined(STD_RANGES)
+#include "STDRangesStream.hpp"
+#elif defined(TBB)
+#include "TBBStream.hpp"
+#elif defined(THRUST)
+#include "ThrustStream.h"
+#elif defined(HIP)
+#include "HIPStream.h"
+#elif defined(HC)
+#include "HCStream.h"
+#elif defined(OCL)
+#include "OCLStream.h"
+#elif defined(USE_RAJA)
+#include "RAJAStream.hpp"
+#elif defined(KOKKOS)
+#include "KokkosStream.hpp"
+#elif defined(ACC)
+#include "ACCStream.h"
+#elif defined(SYCL)
+#include "SYCLStream.h"
+#elif defined(SYCL2020)
+#include "SYCLStream2020.h"
+#elif defined(OMP)
+#include "OMPStream.h"
+#elif defined(FUTHARK)
+#include "FutharkStream.h"
+#endif
+
+template <typename T>
+std::unique_ptr<Stream<T>> make_stream(int array_size, int deviceIndex) {
+#if defined(CUDA)
+  // Use the CUDA implementation
+  return std::make_unique<CUDAStream<T>>(array_size, deviceIndex);
+
+#elif defined(HIP)
+  // Use the HIP implementation
+  return std::make_unique<HIPStream<T>>(array_size, deviceIndex);
+
+#elif defined(HC)
+  // Use the HC implementation
+  return std::make_unique<HCStream<T>>(array_size, deviceIndex);
+
+#elif defined(OCL)
+  // Use the OpenCL implementation
+  return std::make_unique<OCLStream<T>>(array_size, deviceIndex);
+
+#elif defined(USE_RAJA)
+  // Use the RAJA implementation
+  return std::make_unique<RAJAStream<T>>(array_size, deviceIndex);
+
+#elif defined(KOKKOS)
+  // Use the Kokkos implementation
+  return std::make_unique<KokkosStream<T>>(array_size, deviceIndex);
+
+#elif defined(STD_DATA)
+  // Use the C++ STD data-oriented implementation
+  return std::make_unique<STDDataStream<T>>(array_size, deviceIndex);
+
+#elif defined(STD_INDICES)
+  // Use the C++ STD index-oriented implementation
+  return std::make_unique<STDIndicesStream<T>>(array_size, deviceIndex);
+
+#elif defined(STD_RANGES)
+  // Use the C++ STD ranges implementation
+  return std::make_unique<STDRangesStream<T>>(array_size, deviceIndex);
+
+#elif defined(TBB)
+  // Use the C++20 implementation
+  return std::make_unique<TBBStream<T>>(array_size, deviceIndex);
+
+#elif defined(THRUST)
+  // Use the Thrust implementation
+  return std::make_unique<ThrustStream<T>>(array_size, deviceIndex);
+
+#elif defined(ACC)
+  // Use the OpenACC implementation
+  return std::make_unique<ACCStream<T>>(array_size, deviceIndex);
+
+#elif defined(SYCL) || defined(SYCL2020)
+  // Use the SYCL implementation
+  return std::make_unique<SYCLStream<T>>(array_size, deviceIndex);
+
+#elif defined(OMP)
+  // Use the OpenMP implementation
+  return std::make_unique<OMPStream<T>>(array_size, deviceIndex);
+
+#elif defined(FUTHARK)
+  // Use the Futhark implementation
+  return std::make_unique<FutharkStream<T>>(array_size, deviceIndex);
+
+#else
+
+  #error unknown benchmark
+
+#endif
+}
diff --git a/src/Unit.h b/src/Unit.h
new file mode 100644
index 00000000..2efde5d6
--- /dev/null
+++ b/src/Unit.h
@@ -0,0 +1,35 @@
+#pragma once
+#include <iostream>
+
+// Units for output:
+struct Unit {
+  enum class Kind { MegaByte, GigaByte, TeraByte, MibiByte, GibiByte, TebiByte };
+
+  Kind value;
+  
+  explicit Unit(Kind v) : value(v) {}
+
+  double fmt(double bytes) const {
+    switch(value) {
+    case Kind::MibiByte: return std::pow(2.0, -20.0) * bytes;
+    case Kind::MegaByte: return 1.0E-6 * bytes;
+    case Kind::GibiByte: return std::pow(2.0, -30.0) * bytes;
+    case Kind::GigaByte: return 1.0E-9 * bytes;
+    case Kind::TebiByte: return std::pow(2.0, -40.0) * bytes;
+    case Kind::TeraByte: return 1.0E-12 * bytes;      
+    default: std::cerr << "Unimplemented!" << std::endl; std::abort();
+    }
+  }
+
+  char const* str() const {
+    switch(value) {
+    case Kind::MibiByte: return "MiB";
+    case Kind::MegaByte: return "MB";
+    case Kind::GibiByte: return "GiB";
+    case Kind::GigaByte: return "GB";
+    case Kind::TebiByte: return "TiB";
+    case Kind::TeraByte: return "TB";
+    default: std::cerr << "Unimplemented!" << std::endl; std::abort();
+    }
+  }
+};
diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu
index cff1951a..a6361139 100644
--- a/src/cuda/CUDAStream.cu
+++ b/src/cuda/CUDAStream.cu
@@ -1,11 +1,9 @@
-
 // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
 // University of Bristol HPC
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
 
-
 #include "CUDAStream.h"
 
 [[noreturn]] inline void error(char const* file, int line, char const* expr, cudaError_t e) {
@@ -17,12 +15,13 @@
 #define CU(EXPR) do { auto __e = (EXPR); if (__e != cudaSuccess) error(__FILE__, __LINE__, #EXPR, __e); } while(false)
 
 // It is best practice to include __device__ and constexpr even though in BabelStream it only needs to be __host__ const
-__host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1)/b; }
+__host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1) / b; }
 
 cudaStream_t stream;
 
 template <class T>
-CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
+CUDAStream<T>::CUDAStream(const int array_size, const int device_index)
+  : array_size(array_size)
 {
   // Set device
   int count;
@@ -43,20 +42,16 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
 #else
   std::cout << "Memory: DEFAULT" << std::endl;
 #endif
-  array_size = ARRAY_SIZE;
-
 
   // Query device for sensible dot kernel block count
   cudaDeviceProp props;
   CU(cudaGetDeviceProperties(&props, device_index));
   dot_num_blocks = props.multiProcessorCount * 4;
 
-  // Allocate the host array for partial sums for dot kernels
-  sums = (T*)malloc(sizeof(T) * dot_num_blocks);
-
-  size_t array_bytes = sizeof(T);
-  array_bytes *= ARRAY_SIZE;
-  size_t total_bytes = array_bytes * 4;
+  // Size of partial sums for dot kernels
+  size_t sums_bytes = sizeof(T) * dot_num_blocks;
+  size_t array_bytes = sizeof(T) * array_size;
+  size_t total_bytes = array_bytes * size_t(3) + sums_bytes;
   std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl;
 
   // Check buffers fit on the device
@@ -68,45 +63,42 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
   CU(cudaMallocManaged(&d_a, array_bytes));
   CU(cudaMallocManaged(&d_b, array_bytes));
   CU(cudaMallocManaged(&d_c, array_bytes));
-  CU(cudaMallocManaged(&d_sum, dot_num_blocks*sizeof(T)));
+  CU(cudaHostAlloc(&sums, sums_bytes, cudaHostAllocDefault));
 #elif defined(PAGEFAULT)
   d_a = (T*)malloc(array_bytes);
   d_b = (T*)malloc(array_bytes);
   d_c = (T*)malloc(array_bytes);
-  d_sum = (T*)malloc(sizeof(T)*dot_num_blocks);
+  sums = (T*)malloc(sums_bytes);
 #else
   CU(cudaMalloc(&d_a, array_bytes));
   CU(cudaMalloc(&d_b, array_bytes));
   CU(cudaMalloc(&d_c, array_bytes));
-  CU(cudaMalloc(&d_sum, dot_num_blocks*sizeof(T)));
+  CU(cudaHostAlloc(&sums, sums_bytes, cudaHostAllocDefault));
 #endif
 }
 
-
 template <class T>
 CUDAStream<T>::~CUDAStream()
 {
   CU(cudaStreamDestroy(stream));
-  free(sums);
 
 #if defined(PAGEFAULT)
   free(d_a);
   free(d_b);
   free(d_c);
-  free(d_sum);
+  free(sums);
 #else
   CU(cudaFree(d_a));
   CU(cudaFree(d_b));
   CU(cudaFree(d_c));
-  CU(cudaFree(d_sum));
+  CU(cudaFreeHost(sums));
 #endif
 }
 
-
 template <typename T>
 __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC, int array_size)
 {  
-  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
     a[i] = initA;
     b[i] = initB;
     c[i] = initC;
@@ -128,7 +120,7 @@ void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vecto
   // Copy device memory to host
 #if defined(PAGEFAULT) || defined(MANAGED)
   CU(cudaStreamSynchronize(stream));
-  for (int i = 0; i < array_size; i++)
+  for (int i = 0; i < array_size; ++i)
   {
     a[i] = d_a[i];
     b[i] = d_b[i];
@@ -141,11 +133,10 @@ void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vecto
 #endif
 }
 
-
 template <typename T>
 __global__ void copy_kernel(const T * a, T * c, int array_size)
 {
-  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
     c[i] = a[i];
   }
 }
@@ -163,7 +154,7 @@ template <typename T>
 __global__ void mul_kernel(T * b, const T * c, int array_size)
 {
   const T scalar = startScalar;
-  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
     b[i] = scalar * c[i];
   }
 }
@@ -180,7 +171,7 @@ void CUDAStream<T>::mul()
 template <typename T>
 __global__ void add_kernel(const T * a, const T * b, T * c, int array_size)
 {
-  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
     c[i] = a[i] + b[i];
   }
 }
@@ -198,7 +189,7 @@ template <typename T>
 __global__ void triad_kernel(T * a, const T * b, const T * c, int array_size)
 {
   const T scalar = startScalar;
-  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
     a[i] = b[i] + scalar * c[i];
   }
 }
@@ -216,7 +207,7 @@ template <typename T>
 __global__ void nstream_kernel(T * a, const T * b, const T * c, int array_size)
 {
   const T scalar = startScalar;
-  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
     a[i] += b[i] + scalar * c[i];
   }
 }
@@ -231,50 +222,34 @@ void CUDAStream<T>::nstream()
 }
 
 template <class T>
-__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
+__global__ void dot_kernel(const T * a, const T * b, T* sums, int array_size)
 {
-  __shared__ T tb_sum[TBSIZE];
-
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  const size_t local_i = threadIdx.x;
-
-  tb_sum[local_i] = {};
-  for (; i < array_size; i += blockDim.x*gridDim.x)
-    tb_sum[local_i] += a[i] * b[i];
+  __shared__ T smem[TBSIZE];
+  T tmp = T(0.);
+  const size_t tidx = threadIdx.x;
+  for (int i = tidx + (size_t)blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+    tmp += a[i] * b[i];
+  }
+  smem[tidx] = tmp;
 
-  for (int offset = blockDim.x / 2; offset > 0; offset /= 2)
-  {
+  for (int offset = blockDim.x / 2; offset > 0; offset /= 2) {
     __syncthreads();
-    if (local_i < offset)
-    {
-      tb_sum[local_i] += tb_sum[local_i+offset];
-    }
+    if (tidx < offset) smem[tidx] += smem[tidx+offset];
   }
 
-  if (local_i == 0)
-    sum[blockIdx.x] = tb_sum[local_i];
+  // First thread writes to host memory directly from the device
+  if (tidx == 0) sums[blockIdx.x] = smem[tidx];
 }
 
 template <class T>
 T CUDAStream<T>::dot()
 {
-  dot_kernel<<<dot_num_blocks, TBSIZE, 0, stream>>>(d_a, d_b, d_sum, array_size);
+  dot_kernel<<<dot_num_blocks, TBSIZE, 0, stream>>>(d_a, d_b, sums, array_size);
   CU(cudaPeekAtLastError());
-
-#if !(defined(MANAGED) || defined(PAGEFAULT))
-  CU(cudaMemcpyAsync(sums, d_sum, dot_num_blocks*sizeof(T), cudaMemcpyDeviceToHost, stream));
-#endif
   CU(cudaStreamSynchronize(stream));
 
   T sum = 0.0;
-  for (int i = 0; i < dot_num_blocks; i++)
-  {
-#if defined(MANAGED) || defined(PAGEFAULT)
-    sum += d_sum[i];
-#else
-    sum += sums[i];
-#endif
-  }
+  for (int i = 0; i < dot_num_blocks; ++i) sum += sums[i];
 
   return sum;
 }
@@ -302,7 +277,6 @@ void listDevices(void)
   }
 }
 
-
 std::string getDeviceName(const int device)
 {
   cudaDeviceProp props;
@@ -310,7 +284,6 @@ std::string getDeviceName(const int device)
   return std::string(props.name);
 }
 
-
 std::string getDeviceDriver(const int device)
 {
   CU(cudaSetDevice(device));
diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h
index d16511fe..54bf2a18 100644
--- a/src/cuda/CUDAStream.h
+++ b/src/cuda/CUDAStream.h
@@ -31,13 +31,11 @@ class CUDAStream : public Stream<T>
     T *d_a;
     T *d_b;
     T *d_c;
-    T *d_sum;
 
     // Number of blocks for dot kernel
     int dot_num_blocks;
 
   public:
-
     CUDAStream(const int, const int);
     ~CUDAStream();
 
@@ -50,5 +48,4 @@ class CUDAStream : public Stream<T>
 
     virtual void init_arrays(T initA, T initB, T initC) override;
     virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
-
 };
diff --git a/src/main.cpp b/src/main.cpp
index abfc14e4..877127af 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,87 +1,87 @@
-
 // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
 // University of Bristol HPC
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
 
-#include <iostream>
-#include <vector>
-#include <numeric>
-#include <cmath>
-#include <limits>
-#include <chrono>
 #include <algorithm>
-#include <iomanip>
+#include <array>
+#include <chrono>
+#include <cmath>
 #include <cstring>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <numeric>
+#include <vector>
 
 #define VERSION_STRING "5.0"
 
 #include "Stream.h"
-
-#if defined(CUDA)
-#include "CUDAStream.h"
-#elif defined(STD_DATA)
-#include "STDDataStream.h"
-#elif defined(STD_INDICES)
-#include "STDIndicesStream.h"
-#elif defined(STD_RANGES)
-#include "STDRangesStream.hpp"
-#elif defined(TBB)
-#include "TBBStream.hpp"
-#elif defined(THRUST)
-#include "ThrustStream.h"
-#elif defined(HIP)
-#include "HIPStream.h"
-#elif defined(HC)
-#include "HCStream.h"
-#elif defined(OCL)
-#include "OCLStream.h"
-#elif defined(USE_RAJA)
-#include "RAJAStream.hpp"
-#elif defined(KOKKOS)
-#include "KokkosStream.hpp"
-#elif defined(ACC)
-#include "ACCStream.h"
-#elif defined(SYCL)
-#include "SYCLStream.h"
-#elif defined(SYCL2020)
-#include "SYCLStream2020.h"
-#elif defined(OMP)
-#include "OMPStream.h"
-#elif defined(FUTHARK)
-#include "FutharkStream.h"
-#endif
+#include "StreamModels.h"
+#include "Unit.h"
 
 // Default size of 2^25
 int ARRAY_SIZE = 33554432;
-unsigned int num_times = 100;
-unsigned int deviceIndex = 0;
+size_t num_times = 100;
+size_t deviceIndex = 0;
 bool use_float = false;
 bool output_as_csv = false;
-bool mibibytes = false;
+// Default unit of memory is MegaBytes (as per STREAM) 
+Unit unit{Unit::Kind::MegaByte};
+bool silence_errors = false;
 std::string csv_separator = ",";
 
-template <typename T>
-void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum);
+// Benchmark Identifier: identifies individual & groups of benchmarks:
+// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot.
+// - All: all kernels.
+// - Individual kernels only.
+enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};
+
+struct Benchmark {
+  BenchId id;
+  char const* label;
+  // Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW:
+  // bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur
+  size_t weight;
+  // Is it one of: Copy, Mul, Add, Triad, Dot?
+  bool classic = false;
+};
+
+// Benchmarks in the order in which - if present - should be run for validation purposes:
+constexpr size_t num_benchmarks = 6;
+std::array<Benchmark, num_benchmarks> bench = {
+  Benchmark { .id = BenchId::Copy,    .label = "Copy",    .weight = 2, .classic = true  },
+  Benchmark { .id = BenchId::Mul,     .label = "Mul",     .weight = 2, .classic = true  },
+  Benchmark { .id = BenchId::Add,     .label = "Add",     .weight = 3, .classic = true  },
+  Benchmark { .id = BenchId::Triad,   .label = "Triad",   .weight = 3, .classic = true  },
+  Benchmark { .id = BenchId::Dot,     .label = "Dot",     .weight = 2, .classic = true  },
+  Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
+};
+
+// Selected benchmarks to run: default is all 5 classic benchmarks.
+BenchId selection = BenchId::Classic;
+
+// Returns true if the benchmark needs to be run:
+bool run_benchmark(Benchmark const& b) {
+  if (selection == BenchId::All)                  return true;
+  if (selection == BenchId::Classic && b.classic) return true;
+  return selection == b.id;
+}
+
+// Benchmark run order
+// - Classic: runs each bench once in the order above, and repeats n times.
+// - Isolated: runs each bench n times in isolation
+enum class BenchOrder : int {Classic, Isolated};
+BenchOrder order = BenchOrder::Classic;
 
 template <typename T>
 void run();
 
-// Options for running the benchmark:
-// - All 5 kernels (Copy, Add, Mul, Triad, Dot).
-// - Triad only.
-// - Nstream only.
-enum class Benchmark {All, Triad, Nstream};
-
-// Selected run options.
-Benchmark selection = Benchmark::All;
-
 void parseArguments(int argc, char *argv[]);
 
 int main(int argc, char *argv[])
 {
-
   parseArguments(argc, argv);
 
   if (!output_as_csv)
@@ -95,106 +95,86 @@ int main(int argc, char *argv[])
   if (use_float)
     run<float>();
   else
-    run<double>(); 
+    run<double>();
 
+  return EXIT_SUCCESS;
 }
 
-
-// Run the 5 main kernels
-template <typename T>
-std::vector<std::vector<double>> run_all(Stream<T> *stream, T& sum)
-{
-
-  // List of times
-  std::vector<std::vector<double>> timings(5);
-
-  // Declare timers
-  std::chrono::high_resolution_clock::time_point t1, t2;
-
-  // Main loop
-  for (unsigned int k = 0; k < num_times; k++)
-  {
-    // Execute Copy
-    t1 = std::chrono::high_resolution_clock::now();
-    stream->copy();
-    t2 = std::chrono::high_resolution_clock::now();
-    timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-    // Execute Mul
-    t1 = std::chrono::high_resolution_clock::now();
-    stream->mul();
-    t2 = std::chrono::high_resolution_clock::now();
-    timings[1].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-    // Execute Add
-    t1 = std::chrono::high_resolution_clock::now();
-    stream->add();
-    t2 = std::chrono::high_resolution_clock::now();
-    timings[2].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-    // Execute Triad
-    t1 = std::chrono::high_resolution_clock::now();
-    stream->triad();
-    t2 = std::chrono::high_resolution_clock::now();
-    timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-    // Execute Dot
-    t1 = std::chrono::high_resolution_clock::now();
-    sum = stream->dot();
-    t2 = std::chrono::high_resolution_clock::now();
-    timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-  }
-
-  // Compiler should use a move
-  return timings;
+// Returns duration of executing function f:
+template <typename F>
+double time(F&& f) {
+  using clk_t = std::chrono::high_resolution_clock;
+  using dur_t = std::chrono::duration<double>;
+  auto start = clk_t::now();
+  f();
+  return dur_t(clk_t::now() - start).count();
 }
 
-// Run the Triad kernel
+// Run specified kernels
 template <typename T>
-std::vector<std::vector<double>> run_triad(Stream<T> *stream)
+std::vector<std::vector<double>> run_all(std::unique_ptr<Stream<T>>& stream, T& sum)
 {
+  // Times for each measured benchmark:
+  std::vector<std::vector<double>> timings(num_benchmarks);
 
-  std::vector<std::vector<double>> timings(1);
-
-  // Declare timers
-  std::chrono::high_resolution_clock::time_point t1, t2;
-
-  // Run triad in loop
-  t1 = std::chrono::high_resolution_clock::now();
-  for (unsigned int k = 0; k < num_times; k++)
+  // Run a particular benchmark
+  auto run = [&](Benchmark const& b)
   {
-    stream->triad();
-  }
-  t2 = std::chrono::high_resolution_clock::now();
-
-  double runtime = std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count();
-  timings[0].push_back(runtime);
-
-  return timings;
-}
+    switch(b.id) {
+    case BenchId::Copy:    return stream->copy();
+    case BenchId::Mul:     return stream->mul();
+    case BenchId::Add:     return stream->add();
+    case BenchId::Triad:   return stream->triad();
+    case BenchId::Dot:     sum = stream->dot(); return;
+    case BenchId::Nstream: return stream->nstream();
+    default:
+      std::cerr << "Unimplemented benchmark: " << b.label << std::endl;
+      abort();
+    }
+  };
 
-// Run the Nstream kernel
-template <typename T>
-std::vector<std::vector<double>> run_nstream(Stream<T> *stream)
-{
-  std::vector<std::vector<double>> timings(1);
+  // Time a particular benchmark:
+  auto dt = [&](Benchmark const& b) { return time([&] { run(b); }); };
 
-  // Declare timers
-  std::chrono::high_resolution_clock::time_point t1, t2;
+  // Reserve timings:
+  for (size_t i = 0; i < num_benchmarks; ++i) {
+    if (!run_benchmark(bench[i])) continue;
+    timings[i].reserve(num_times);
+  }
 
-  // Run nstream in loop
-  for (int k = 0; k < num_times; k++) {
-    t1 = std::chrono::high_resolution_clock::now();
-    stream->nstream();
-    t2 = std::chrono::high_resolution_clock::now();
-    timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+  switch(order) {
+  // Classic runs each benchmark once in the order specifies in the "bench" array above,
+  // and then repeats num_times:
+  case BenchOrder::Classic: {
+    for (size_t k = 0; k < num_times; k++) {
+      for (size_t i = 0; i < num_benchmarks; ++i) {
+	if (!run_benchmark(bench[i])) continue;
+	timings[i].push_back(dt(bench[i]));
+      }
+    }
+    break;
+  }
+  // Isolated runs each benchmark num_times, before proceeding to run the next benchmark:
+  case BenchOrder::Isolated: {
+    for (size_t i = 0; i < num_benchmarks; ++i) {
+      if (!run_benchmark(bench[i])) continue;
+      auto t = time([&] { for (size_t k = 0; k < num_times; k++) run(bench[i]); });
+      timings[i].resize(num_times, t / (double)num_times);
+    }
+    break;
+  }
+  default:
+    std::cerr << "Unimplemented order" << std::endl;
+    abort();
   }
 
+  // Compiler should use a move
   return timings;
-
 }
 
+template <typename T>
+void check_solution(const size_t ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c,
+		    T& sum);
 
 // Generic run routine
 // Runs the kernel(s) and prints output.
@@ -203,206 +183,110 @@ void run()
 {
   std::streamsize ss = std::cout.precision();
 
+  // Formatting utilities:
+  auto fmt_bw = [&](size_t weight, double dt) {
+    return unit.fmt((weight * sizeof(T) * ARRAY_SIZE)/dt);
+  };
+  auto fmt_csv_header = [] {
+    std::cout
+      << "function" << csv_separator
+      << "num_times" << csv_separator
+      << "n_elements" << csv_separator
+      << "sizeof" << csv_separator
+      << "max_" << unit.str() << "_per_sec" << csv_separator
+      << "min_runtime" << csv_separator
+      << "max_runtime" << csv_separator
+      << "avg_runtime" << std::endl;
+  };
+  auto fmt_csv = [](char const* function, size_t num_times, size_t num_elements,
+                    size_t type_size, double bandwidth,
+		    double dt_min, double dt_max, double dt_avg) {
+    std::cout << function << csv_separator
+         << num_times << csv_separator
+         << num_elements << csv_separator
+         << type_size << csv_separator
+         << bandwidth << csv_separator
+         << dt_min << csv_separator
+         << dt_max << csv_separator
+         << dt_avg << std::endl;
+  };
+  auto fmt_cli = [](char const* function, double bandwidth,
+		    double dt_min, double dt_max, double dt_avg) {
+    std::cout
+      << std::left << std::setw(12) << function
+      << std::left << std::setw(12) << std::setprecision(3) << bandwidth
+      << std::left << std::setw(12) << std::setprecision(5) << dt_min
+      << std::left << std::setw(12) << std::setprecision(5) << dt_max
+      << std::left << std::setw(12) << std::setprecision(5) << dt_avg
+      << std::endl;
+  };
+  auto fmt_result = [&](char const* function, size_t num_times, size_t num_elements,
+                        size_t type_size, double bandwidth,
+			double dt_min, double dt_max, double dt_avg) {
+    if (!output_as_csv) return fmt_cli(function, bandwidth, dt_min, dt_max, dt_avg);
+    fmt_csv(function, num_times, num_elements, type_size, bandwidth, dt_min, dt_max, dt_avg);
+  };
+
   if (!output_as_csv)
   {
-    if (selection == Benchmark::All)
-      std::cout << "Running kernels " << num_times << " times" << std::endl;
-    else if (selection == Benchmark::Triad)
-    {
-      std::cout << "Running triad " << num_times << " times" << std::endl;
-      std::cout << "Number of elements: " << ARRAY_SIZE << std::endl;
-    }
-
-
-    if (sizeof(T) == sizeof(float))
-      std::cout << "Precision: float" << std::endl;
-    else
-      std::cout << "Precision: double" << std::endl;
-
-
-    if (mibibytes)
-    {
-      // MiB = 2^20
-      std::cout << std::setprecision(1) << std::fixed
-                << "Array size: " << ARRAY_SIZE*sizeof(T)*std::pow(2.0, -20.0) << " MiB"
-                << " (=" << ARRAY_SIZE*sizeof(T)*std::pow(2.0, -30.0) << " GiB)" << std::endl;
-      std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*std::pow(2.0, -20.0) << " MiB"
-                << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*std::pow(2.0, -30.0) << " GiB)" << std::endl;
-    }
-    else
-    {
-      // MB = 10^6
-      std::cout << std::setprecision(1) << std::fixed
-                << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
-                << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl;
-      std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
-                << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl;
+    std::cout << "Running ";
+    switch(selection) {
+    case BenchId::All: std::cout << " All kernels "; break;
+    case BenchId::Classic: std::cout << " Classic kernels "; break;
+    default:
+      std::cout << "Running ";
+      for (size_t i = 0; i < num_benchmarks; ++i) {
+	if (selection == bench[i].id) {
+	  std::cout << bench[i].label;
+	  break;
+	}
+      }
+      std::cout << " ";
     }
+    std::cout << num_times << " times" << std::endl;
+    std::cout << "Number of elements: " << ARRAY_SIZE << std::endl;
+    std::cout << "Precision: " << (sizeof(T) == sizeof(float)? "float" : "double") << std::endl;
+
+    size_t nbytes = ARRAY_SIZE * sizeof(T);
+    std::cout << std::setprecision(1) << std::fixed
+	      << "Array size: " << unit.fmt(nbytes) << " " << unit.str() << std::endl;
+    std::cout << "Total size: " << unit.fmt(3.0*nbytes) << " " << unit.str() << std::endl;
     std::cout.precision(ss);
-
   }
 
-  Stream<T> *stream;
-
-#if defined(CUDA)
-  // Use the CUDA implementation
-  stream = new CUDAStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(HIP)
-  // Use the HIP implementation
-  stream = new HIPStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(HC)
-  // Use the HC implementation
-  stream = new HCStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(OCL)
-  // Use the OpenCL implementation
-  stream = new OCLStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(USE_RAJA)
-  // Use the RAJA implementation
-  stream = new RAJAStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(KOKKOS)
-  // Use the Kokkos implementation
-  stream = new KokkosStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(STD_DATA)
-  // Use the C++ STD data-oriented implementation
-  stream = new STDDataStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(STD_INDICES)
-  // Use the C++ STD index-oriented implementation
-  stream = new STDIndicesStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(STD_RANGES)
-  // Use the C++ STD ranges implementation
-  stream = new STDRangesStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(TBB)
-  // Use the C++20 implementation
-  stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(THRUST)
-  // Use the Thrust implementation
-  stream = new ThrustStream<T>(ARRAY_SIZE, deviceIndex); 
-
-#elif defined(ACC)
-  // Use the OpenACC implementation
-  stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(SYCL) || defined(SYCL2020)
-  // Use the SYCL implementation
-  stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(OMP)
-  // Use the OpenMP implementation
-  stream = new OMPStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(FUTHARK)
-  // Use the Futhark implementation
-  stream = new FutharkStream<T>(ARRAY_SIZE, deviceIndex);
-
-#endif
-
-  auto init1 = std::chrono::high_resolution_clock::now();
-  stream->init_arrays(startA, startB, startC);
-  auto init2 = std::chrono::high_resolution_clock::now();
+  std::unique_ptr<Stream<T>> stream = make_stream<T>(ARRAY_SIZE, deviceIndex);
+  auto initElapsedS = time([&] { stream->init_arrays(startA, startB, startC); });
 
   // Result of the Dot kernel, if used.
   T sum{};
+  std::vector<std::vector<double>> timings = run_all<T>(stream, sum);
 
-  std::vector<std::vector<double>> timings;
-
-  switch (selection)
-  {
-    case Benchmark::All:
-      timings = run_all<T>(stream, sum);
-      break;
-    case Benchmark::Triad:
-      timings = run_triad<T>(stream);
-      break;
-    case Benchmark::Nstream:
-      timings = run_nstream<T>(stream);
-      break;
-  };
-
-  // Check solutions
-  // Create host vectors
-  std::vector<T> a(ARRAY_SIZE);
-  std::vector<T> b(ARRAY_SIZE);
-  std::vector<T> c(ARRAY_SIZE);
-
-
-  auto read1 = std::chrono::high_resolution_clock::now();
-  stream->read_arrays(a, b, c);
-  auto read2 = std::chrono::high_resolution_clock::now();
+  // Create & read host vectors:
+  std::vector<T> a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE);
+  auto readElapsedS = time([&] { stream->read_arrays(a, b, c); });
 
-  auto initElapsedS = std::chrono::duration_cast<std::chrono::duration<double>>(read2 - read1).count();
-  auto readElapsedS = std::chrono::duration_cast<std::chrono::duration<double>>(init2 - init1).count();
-  auto initBWps = ((mibibytes ? std::pow(2.0, -20.0) : 1.0E-6) * (3 * sizeof(T) * ARRAY_SIZE)) / initElapsedS;
-  auto readBWps = ((mibibytes ? std::pow(2.0, -20.0) : 1.0E-6) * (3 * sizeof(T) * ARRAY_SIZE)) / readElapsedS;
+  check_solution<T>(num_times, a, b, c, sum);
+  auto initBWps = fmt_bw(3, initElapsedS);
+  auto readBWps = fmt_bw(3, readElapsedS);
 
   if (output_as_csv)
   {
-    std::cout
-      << "phase" << csv_separator
-      << "n_elements" << csv_separator
-      << "sizeof" << csv_separator
-      << ((mibibytes) ? "max_mibytes_per_sec" : "max_mbytes_per_sec") << csv_separator
-      << "runtime" << std::endl;
-    std::cout
-      << "Init" << csv_separator
-      << ARRAY_SIZE << csv_separator
-      << sizeof(T) << csv_separator
-      << initBWps << csv_separator
-      << initElapsedS << std::endl;
-    std::cout
-      << "Read" << csv_separator
-      << ARRAY_SIZE << csv_separator
-      << sizeof(T) << csv_separator
-      << readBWps << csv_separator
-      << readElapsedS << std::endl;
+    fmt_csv_header();
+    fmt_csv("Init", 1, ARRAY_SIZE, sizeof(T), initBWps, initElapsedS, initElapsedS, initElapsedS);
+    fmt_csv("Read", 1, ARRAY_SIZE, sizeof(T), readBWps, readElapsedS, readElapsedS, readElapsedS);
   }
   else
   {
     std::cout << "Init: "
       << std::setw(7)
-      << initElapsedS
-      << " s (="
-      << initBWps
-      << (mibibytes ? " MiBytes/sec" : " MBytes/sec")
-      << ")" << std::endl;
+      << initElapsedS << " s (=" << initBWps << " " << unit.str() << "/s" << ")" << std::endl;
     std::cout << "Read: "
       << std::setw(7)
-      << readElapsedS
-      << " s (="
-      << readBWps
-      << (mibibytes ? " MiBytes/sec" : " MBytes/sec")
-      << ")" << std::endl;
-  }
+      << readElapsedS << " s (=" << readBWps << " " << unit.str() << "/s" << ")" << std::endl;
 
-  check_solution<T>(num_times, a, b, c, sum);
-
-  // Display timing results
-  if (output_as_csv)
-  {
-    std::cout
-      << "function" << csv_separator
-      << "num_times" << csv_separator
-      << "n_elements" << csv_separator
-      << "sizeof" << csv_separator
-      << ((mibibytes) ? "max_mibytes_per_sec" : "max_mbytes_per_sec") << csv_separator
-      << "min_runtime" << csv_separator
-      << "max_runtime" << csv_separator
-      << "avg_runtime" << std::endl;
-  }
-  else
-  {
     std::cout
       << std::left << std::setw(12) << "Function"
-      << std::left << std::setw(12) << ((mibibytes) ? "MiBytes/sec" : "MBytes/sec")
+      << std::left << std::setw(12) << (std::string(unit.str()) + "/s")
       << std::left << std::setw(12) << "Min (sec)"
       << std::left << std::setw(12) << "Max"
       << std::left << std::setw(12) << "Average"
@@ -410,186 +294,135 @@ void run()
       << std::fixed;
   }
 
-
-  if (selection == Benchmark::All || selection == Benchmark::Nstream)
+  for (size_t i = 0; i < num_benchmarks; ++i)
   {
+    if (!run_benchmark(bench[i])) continue;
 
-    std::vector<std::string> labels;
-    std::vector<size_t> sizes;
+    // Get min/max; ignore the first result
+    auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end());
 
-    if (selection == Benchmark::All)
-    {
-      labels = {"Copy", "Mul", "Add", "Triad", "Dot"};
-      sizes = {
-        2 * sizeof(T) * ARRAY_SIZE,
-        2 * sizeof(T) * ARRAY_SIZE,
-        3 * sizeof(T) * ARRAY_SIZE,
-        3 * sizeof(T) * ARRAY_SIZE,
-        2 * sizeof(T) * ARRAY_SIZE};
-    } else if (selection == Benchmark::Nstream)
-    {
-      labels = {"Nstream"};
-      sizes = {4 * sizeof(T) * ARRAY_SIZE };
-    }
+    // Calculate average; ignore the first result
+    double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0)
+      / (double)(num_times - 1);
 
-    for (int i = 0; i < timings.size(); ++i)
-    {
-      // Get min/max; ignore the first result
-      auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end());
-
-      // Calculate average; ignore the first result
-      double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(num_times - 1);
-
-      // Display results
-      if (output_as_csv)
-      {
-        std::cout
-          << labels[i] << csv_separator
-          << num_times << csv_separator
-          << ARRAY_SIZE << csv_separator
-          << sizeof(T) << csv_separator
-          << ((mibibytes) ? std::pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator
-          << *minmax.first << csv_separator
-          << *minmax.second << csv_separator
-          << average
-          << std::endl;
-      }
-      else
-      {
-        std::cout
-          << std::left << std::setw(12) << labels[i]
-          << std::left << std::setw(12) << std::setprecision(3) << 
-            ((mibibytes) ? std::pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first)
-          << std::left << std::setw(12) << std::setprecision(5) << *minmax.first
-          << std::left << std::setw(12) << std::setprecision(5) << *minmax.second
-          << std::left << std::setw(12) << std::setprecision(5) << average
-          << std::endl;
-      }
-    }
-  } else if (selection == Benchmark::Triad)
-  {
-    // Display timing results
-    double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times;
-    double bandwidth = ((mibibytes) ? std::pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]);
-
-    if (output_as_csv)
-    {
-      std::cout
-        << "function" << csv_separator
-        << "num_times" << csv_separator
-        << "n_elements" << csv_separator
-        << "sizeof" << csv_separator
-        << ((mibibytes) ? "gibytes_per_sec" : "gbytes_per_sec") << csv_separator
-        << "runtime"
-        << std::endl;
-      std::cout
-        << "Triad" << csv_separator
-        << num_times << csv_separator
-        << ARRAY_SIZE << csv_separator
-        << sizeof(T) << csv_separator
-        << bandwidth << csv_separator
-        << timings[0][0]
-        << std::endl;
-    }
-    else
-    {
-      std::cout
-        << "--------------------------------"
-        << std::endl << std::fixed
-        << "Runtime (seconds): " << std::left << std::setprecision(5)
-        << timings[0][0] << std::endl
-        << "Bandwidth (" << ((mibibytes) ? "GiB/s" : "GB/s") << "):  "
-        << std::left << std::setprecision(3)
-        << bandwidth << std::endl;
-    }
+    // Display results
+    fmt_result(bench[i].label, num_times, ARRAY_SIZE, sizeof(T),
+	       fmt_bw(bench[i].weight, *minmax.first), *minmax.first, *minmax.second, average);
   }
-
-  delete stream;
-
 }
 
-
 template <typename T>
-void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum)
+void check_solution(const size_t num_times,
+		    std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum)
 {
   // Generate correct solution
   T goldA = startA;
   T goldB = startB;
   T goldC = startC;
-  T goldSum{};
+  T goldS = T(0.);
 
   const T scalar = startScalar;
 
-  for (unsigned int i = 0; i < ntimes; i++)
-  {
-    // Do STREAM!
-    if (selection == Benchmark::All)
-    {
-      goldC = goldA;
-      goldB = scalar * goldC;
-      goldC = goldA + goldB;
-      goldA = goldB + scalar * goldC;
-    } else if (selection == Benchmark::Triad)
-    {
-      goldA = goldB + scalar * goldC;
-    } else if (selection == Benchmark::Nstream)
-    {
-      goldA += goldB + scalar * goldC;
+  // Updates output due to running each benchmark:
+  auto run = [&](int b) {
+    switch(bench[b].id) {
+    case BenchId::Copy:    goldC = goldA; break;
+    case BenchId::Mul:     goldB = scalar * goldC; break;
+    case BenchId::Add:     goldC = goldA + goldB; break;
+    case BenchId::Triad:   goldA = goldB + scalar * goldC; break;
+    case BenchId::Nstream: goldA += goldB + scalar * goldC; break;
+    case BenchId::Dot:     goldS = goldA * goldB * T(ARRAY_SIZE); break; // This calculates the answer exactly
+    default:
+    std::cerr << "Unimplemented Check: " << bench[b].label << std::endl;
+    abort();
+    }
+  };
+
+  switch(order) {
+  // Classic runs each benchmark once in the order specifies in the "bench" array above,
+  // and then repeats num_times:
+  case BenchOrder::Classic: {
+    for (size_t k = 0; k < num_times; k++) {
+      for (size_t i = 0; i < num_benchmarks; ++i) {
+	      if (!run_benchmark(bench[i])) continue;
+	      run(i);
+      }
     }
+    break;
+  }
+  // Isolated runs each benchmark num_times, before proceeding to run the next benchmark:
+  case BenchOrder::Isolated: {
+    for (size_t i = 0; i < num_benchmarks; ++i) {
+      if (!run_benchmark(bench[i])) continue;
+      for (size_t k = 0; k < num_times; k++) run(i);
+    }
+    break;
+  }
+  default:
+    std::cerr << "Unimplemented order" << std::endl;
+    abort();
   }
 
-  // Do the reduction
-  goldSum = goldA * goldB * ARRAY_SIZE;
+  // Error relative tolerance check
+  size_t failed = 0;
+  T epsi = std::numeric_limits<T>::epsilon() * T(100000.0);
+  auto check = [&](const char* name, T is, T should, T e, size_t i = size_t(-1)) {
+    if (e > epsi) {
+      ++failed;
+      if (failed > 10) return;
+      std::cerr << "FAILED validation of " << name;
+      if (i != size_t(-1)) std::cerr << "[" << i << "]";
+      std::cerr << ": " << is << " != " << should
+		<< ", relative error=" << e << " > " << epsi << std::endl;
+    }
+  };
 
-  // Calculate the average error
-  long double errA = std::accumulate(a.begin(), a.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldA); });
-  errA /= a.size();
-  long double errB = std::accumulate(b.begin(), b.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldB); });
-  errB /= b.size();
-  long double errC = std::accumulate(c.begin(), c.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldC); });
-  errC /= c.size();
-  long double errSum = std::fabs((sum - goldSum)/goldSum);
+  // Sum
+  T eS = std::fabs(sum - goldS) / std::fabs(goldS);
+  for (size_t i = 0; i < num_benchmarks; ++i) {
+    if (bench[i].id != BenchId::Dot) continue;
+    if (run_benchmark(bench[i]))
+      check("sum", sum,  goldS, eS);
+    break;
+  }
 
-  long double epsi = std::numeric_limits<T>::epsilon() * 100.0;
+  // Calculate the L^infty-norm relative error
+  for (size_t i = 0; i < a.size(); ++i) {
+    T vA = a[i], vB = b[i], vC = c[i];
+    T eA = std::fabs(vA - goldA) / std::fabs(goldA);
+    T eB = std::fabs(vB - goldB) / std::fabs(goldB);
+    T eC = std::fabs(vC - goldC) / std::fabs(goldC);
 
-  if (errA > epsi)
-    std::cerr
-      << "Validation failed on a[]. Average error " << errA
-      << std::endl;
-  if (errB > epsi)
-    std::cerr
-      << "Validation failed on b[]. Average error " << errB
-      << std::endl;
-  if (errC > epsi)
-    std::cerr
-      << "Validation failed on c[]. Average error " << errC
-      << std::endl;
-  // Check sum to 8 decimal places
-  if (selection == Benchmark::All && errSum > 1.0E-8)
-    std::cerr
-      << "Validation failed on sum. Error " << errSum
-      << std::endl << std::setprecision(15)
-      << "Sum was " << sum << " but should be " << goldSum
-      << std::endl;
+    check("a", a[i], goldA, eA, i);
+    check("b", b[i], goldB, eB, i);
+    check("c", c[i], goldC, eC, i);
+  }
 
+  if (failed > 0 && !silence_errors)
+    std::exit(EXIT_FAILURE);
 }
 
-int parseUInt(const char *str, unsigned int *output)
+void parseArguments(int argc, char *argv[])
 {
-  char *next;
-  *output = strtoul(str, &next, 10);
-  return !strlen(next);
-}
+  auto parseUInt =[](const char *str, size_t *output) {
+    char *next;
+    *output = strtoull(str, &next, 10);
+    return !strlen(next);
+  };
+  auto parseInt = [](const char *str, intptr_t *output) {
+    char *next;
+    *output = strtoll(str, &next, 10);
+    return !strlen(next);
+  };
 
-int parseInt(const char *str, int *output)
-{
-  char *next;
-  *output = strtol(str, &next, 10);
-  return !strlen(next);
-}
+  // Prints all available benchmark labels:
+  auto print_labels = [&](auto& os) {
+    for (size_t i = 0; i < num_benchmarks; ++i) {
+      os << bench[i].label;
+      if (i != (num_benchmarks - 1)) os << ",";
+    }
+  };
 
-void parseArguments(int argc, char *argv[])
-{
   for (int i = 1; i < argc; i++)
   {
     if (!std::string("--list").compare(argv[i]))
@@ -608,11 +441,13 @@ void parseArguments(int argc, char *argv[])
     else if (!std::string("--arraysize").compare(argv[i]) ||
              !std::string("-s").compare(argv[i]))
     {
-      if (++i >= argc || !parseInt(argv[i], &ARRAY_SIZE) || ARRAY_SIZE <= 0)
+      intptr_t array_size;
+      if (++i >= argc || !parseInt(argv[i], &array_size) || array_size <= 0)
       {
         std::cerr << "Invalid array size." << std::endl;
-        exit(EXIT_FAILURE);
+        std::exit(EXIT_FAILURE);
       }
+      ARRAY_SIZE = array_size;
     }
     else if (!std::string("--numtimes").compare(argv[i]) ||
              !std::string("-n").compare(argv[i]))
@@ -632,13 +467,57 @@ void parseArguments(int argc, char *argv[])
     {
       use_float = true;
     }
-    else if (!std::string("--triad-only").compare(argv[i]))
+    else if (!std::string("--print-names").compare(argv[i]))
+    {
+      std::cout << "Available benchmarks: ";
+      print_labels(std::cout);
+      std::cout << std::endl;
+      std::exit(EXIT_SUCCESS);
+    }
+    else if (!std::string("--only").compare(argv[i]) || !std::string("-o").compare(argv[i]))
     {
-      selection = Benchmark::Triad;
+      if (++i >= argc)
+      {
+        std::cerr << "Expected benchmark name after --only" << std::endl;
+        std::exit(EXIT_FAILURE);
+      }
+      auto key = std::string(argv[i]);
+      if (key == "Classic")
+      {
+        selection = BenchId::Classic;
+      }
+      else if (key == "All")
+      {
+        selection = BenchId::All;
+      }
+      else
+      {
+        auto p = std::find_if(bench.begin(), bench.end(), [&](Benchmark const& b) {
+          return std::string(b.label) == key;
+        });
+        if (p == bench.end()) {
+          std::cerr << "Unknown benchmark name \"" << argv[i] << "\" after --only" << std::endl;
+          std::cerr << "Available benchmarks: All, Classic,";
+          print_labels(std::cerr);
+          std::cerr << std::endl;
+          std::exit(EXIT_FAILURE);
+        }
+        selection = p->id;
+      }
     }
-    else if (!std::string("--nstream-only").compare(argv[i]))
+    else if (!std::string("--order").compare(argv[i]))
     {
-      selection = Benchmark::Nstream;
+      if (++i >= argc)
+      {
+       std::cerr << "Expected benchmark order after --order. Options: \"classic\" (default), \"isolated\"."
+		  << std::endl;
+        exit(EXIT_FAILURE);
+      }
+      auto key = std::string(argv[i]);
+      if (key == "isolated")
+      {
+        order = BenchOrder::Isolated;
+      }
     }
     else if (!std::string("--csv").compare(argv[i]))
     {
@@ -646,7 +525,31 @@ void parseArguments(int argc, char *argv[])
     }
     else if (!std::string("--mibibytes").compare(argv[i]))
     {
-      mibibytes = true;
+      unit = Unit(Unit::Kind::MibiByte);
+    }
+    else if (!std::string("--megabytes").compare(argv[i]))
+    {
+      unit = Unit(Unit::Kind::MegaByte);
+    }
+    else if (!std::string("--gibibytes").compare(argv[i]))
+    {
+      unit = Unit(Unit::Kind::GibiByte);
+    }
+    else if (!std::string("--gigabytes").compare(argv[i]))
+    {
+      unit = Unit(Unit::Kind::GigaByte);
+    }
+    else if (!std::string("--tebibytes").compare(argv[i]))
+    {
+      unit = Unit(Unit::Kind::TebiByte);
+    }
+    else if (!std::string("--terabytes").compare(argv[i]))
+    {
+      unit = Unit(Unit::Kind::TeraByte);
+    }
+    else if (!std::string("--silence-errors").compare(argv[i]))
+    {
+      silence_errors = true;
     }
     else if (!std::string("--help").compare(argv[i]) ||
              !std::string("-h").compare(argv[i]))
@@ -660,18 +563,25 @@ void parseArguments(int argc, char *argv[])
       std::cout << "  -s  --arraysize  SIZE    Use SIZE elements in the array" << std::endl;
       std::cout << "  -n  --numtimes   NUM     Run the test NUM times (NUM >= 2)" << std::endl;
       std::cout << "      --float              Use floats (rather than doubles)" << std::endl;
-      std::cout << "      --triad-only         Only run triad" << std::endl;
-      std::cout << "      --nstream-only       Only run nstream" << std::endl;
+      std::cout << "  -o  --only       NAME    Only run one benchmark (see --print-names)" << std::endl;
+      std::cout << "      --print-names        Prints all available benchmark names" << std::endl;
+      std::cout << "      --order              Benchmark run order: \"classic\" (default) or \"isolated\"." << std::endl;
       std::cout << "      --csv                Output as csv table" << std::endl;
+      std::cout << "      --megabytes          Use MB=10^6 for bandwidth calculation (default)" << std::endl;
       std::cout << "      --mibibytes          Use MiB=2^20 for bandwidth calculation (default MB=10^6)" << std::endl;
+      std::cout << "      --gibibytes          Use GiB=2^30 for bandwidth calculation (default MB=10^6)" << std::endl;
+      std::cout << "      --gigabytes          Use GB=10^9 for bandwidth calculation (default MB=10^6)" << std::endl;
+      std::cout << "      --tebibytes          Use TiB=2^40 for bandwidth calculation (default MB=10^6)" << std::endl;
+      std::cout << "      --terabytes          Use TB=10^12 for bandwidth calculation (default MB=10^6)" << std::endl;
+      std::cout << "      --silence-errors     Ignores validation errors." << std::endl;
       std::cout << std::endl;
-      exit(EXIT_SUCCESS);
+      std::exit(EXIT_SUCCESS);
     }
     else
     {
       std::cerr << "Unrecognized argument '" << argv[i] << "' (try '--help')"
                 << std::endl;
-      exit(EXIT_FAILURE);
+      std::exit(EXIT_FAILURE);
     }
   }
 }
diff --git a/src/omp/model.cmake b/src/omp/model.cmake
index 1955ebc8..56f37cf5 100644
--- a/src/omp/model.cmake
+++ b/src/omp/model.cmake
@@ -151,7 +151,7 @@ macro(setup)
         # offload but OFFLOAD_FLAGS overrides
         register_definitions(OMP_TARGET_GPU)
         separate_arguments(OFFLOAD_FLAGS)
-        list(OMP_FLAGS APPEND ${OFFLOAD_FLAGS})
+        list(APPEND OMP_FLAGS ${OFFLOAD_FLAGS})
     else ()
 
         # handle the vendor:arch value