cp2k
diff --git a/‎CMakeLists.txt
+23-12 b/‎CMakeLists.txt
+23-12
diff --git a/‎cmake/CompilerConfiguration.cmake
+5 b/‎cmake/CompilerConfiguration.cmake
+5
diff --git a/‎src/CMakeLists.txt
+25-8 b/‎src/CMakeLists.txt
+25-8
diff --git a/‎src/acc/PACKAGE
+1-1 b/‎src/acc/PACKAGE
+1-1
diff --git a/‎src/acc/acc_bench_trans.c
+19-17 b/‎src/acc/acc_bench_trans.c
+19-17
@@ -92,12 +92,16 @@ set_property(CACHE USE_SMM PROPERTY STRINGS blas libxsmm)
 
 option(USE_CUDA "Build with CUDA support" OFF)
 option(USE_HIP "Build with HIP support" OFF)
-# USE_CUDA and USE_HIP are mutually exclusive options: we either compile with
-# nvcc OR with hipcc
-if (USE_CUDA AND USE_HIP)
+option(USE_OPENCL "Build with OpenCL support" OFF)
+
+# at most one of USE_CUDA, USE_HIP, and USE_OPENCL may be enabled per build
+if ((USE_CUDA AND USE_HIP)
+    OR (USE_CUDA AND USE_OPENCL)
+    OR (USE_OPENCL AND USE_HIP))
   message(
     FATAL_ERROR
-      "USE_CUDA and USE_HIP options are mutually exclusive. Please choose one.")
+      "USE_CUDA, USE_HIP, and USE_OPENCL are mutually exclusive. Please choose one."
+  )
 endif ()
 
 set(SUPPORTED_CUDA_ARCHITECTURES K20X K40 K80 P100 V100)
@@ -117,9 +121,10 @@ enable_language(Fortran)
 
 if (WITH_C_API AND WITH_EXAMPLES)
   enable_language(CXX)
+  enable_language(C)
 endif ()
 
-# we're always using at least C++11
+# always use at least C++11
 set(CMAKE_CXX_STANDARD 11)
 
 # =================================================================================================
@@ -141,8 +146,8 @@ find_package(LAPACK REQUIRED) # needed for some of the integrated test routines,
 # environment for a python interpreter before searching elsewhere in the system.
 # In CMake <3.15, the system is searched before the virtual environment.
 if (NOT Python_EXECUTABLE)
-  # If the python interpreter isn't specified as a command line option, look for
-  # it:
+  # If the python interpreter is not specified as a command line option, look
+  # for it:
   find_package(
     Python
     COMPONENTS Interpreter
@@ -186,13 +191,22 @@ if (USE_SMM MATCHES "blas")
   message("-- Using BLAS for Small Matrix Multiplication")
 elseif (USE_SMM MATCHES "libxsmm")
   # rely on pkg-config in order to link against libxsmm
-  pkg_check_modules(deps REQUIRED IMPORTED_TARGET GLOBAL libxsmmf)
+  pkg_check_modules(LIBXSMM REQUIRED IMPORTED_TARGET GLOBAL libxsmmf)
   message("-- Using libxsmm for Small Matrix Multiplication")
 else ()
   message(FATAL_ERROR "Unknown SMM library specified")
 endif ()
 
-# =================================== GPU backend
+# =================================== GPU backends
+if (USE_OPENCL)
+  if (NOT LIBXSMM_FOUND) # LIBXSMM is only searched for if USE_SMM is "libxsmm"
+    message(FATAL_ERROR "ACC/OpenCL requires LIBXSMM: please configure with -DUSE_SMM=libxsmm.")
+  endif ()
+  # locate the OpenCL headers/library; the backend sources (*.c) need C enabled
+  find_package(OpenCL REQUIRED)
+  enable_language(C)
+endif ()
+
 if (USE_CUDA OR USE_HIP)
   enable_language(CXX)
   set(GPU_ARCH_NUMBER_K20X 35)
@@ -204,7 +218,6 @@ if (USE_CUDA OR USE_HIP)
 endif ()
 
 if (USE_CUDA)
-
   enable_language(CUDA)
   if (CMAKE_CUDA_COMPILER_VERSION LESS 5.5)
     message(FATAL_ERROR "CUDA version >= 5.5 is required.")
@@ -243,7 +256,6 @@ if (USE_CUDA)
   else ()
     message(STATUS "Found cuBLAS: ${CUBLAS}")
   endif ()
-
   if (WITH_CUDA_PROFILING)
     find_library(
       CUDA_NVTOOLSEXT nvToolsExt
@@ -258,7 +270,6 @@ endif ()
 # inspired from
 # https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/2_Cookbook/12_cmake_hip_add_executable
 if (USE_HIP)
-
   # Make sure the GPU required is supported
   list(FIND SUPPORTED_HIP_ARCHITECTURES ${WITH_GPU} GPU_SUPPORTED)
   if (GPU_SUPPORTED EQUAL -1)
 
@@ -88,3 +88,8 @@ Please open an issue at https://github.com/cp2k/dbcsr/issues with the reported c
   message("-- CMAKE_CXX_COMPILER_ID: " ${CMAKE_CXX_COMPILER_ID})
   message("-- CMAKE_CXX_COMPILER full path: " ${CMAKE_CXX_COMPILER})
 endif ()
+
+# inherit the per-configuration C flags (Release, Coverage, Debug) from CXX
+set(CMAKE_C_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
+set(CMAKE_C_FLAGS_COVERAGE ${CMAKE_CXX_FLAGS_COVERAGE})
+set(CMAKE_C_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
@@ -105,6 +105,10 @@ add_fypp_sources(
   utils/dbcsr_toollib.F
   work/dbcsr_work_operations.F)
 
+# C sources of the OpenCL accelerator backend (one file per line)
+set(DBCSR_OPENCL_SRCS
+    acc/opencl/acc_opencl.c
+    acc/opencl/acc_opencl_event.c
+    acc/opencl/acc_opencl_mem.c
+    acc/opencl/acc_opencl_stream.c)
+
 set(DBCSR_CUDA_SRCS
     acc/cuda/acc_cublas.cu
     acc/cuda/acc_cuda.cpp
@@ -141,13 +145,9 @@ add_library(dbcsr ${DBCSR_SRCS})
 set_target_properties(dbcsr PROPERTIES VERSION ${dbcsr_VERSION}
                                        SOVERSION ${dbcsr_APIVERSION})
 
-if (TARGET PkgConfig::deps)
-  target_link_libraries(dbcsr PRIVATE PkgConfig::deps)
-endif ()
-
-if (USE_SMM MATCHES "libxsmm")
-  # linker/include flags are managed by pkg-config (above)
+if (LIBXSMM_FOUND)
   target_compile_definitions(dbcsr PRIVATE __LIBXSMM)
+  target_link_libraries(dbcsr PRIVATE PkgConfig::LIBXSMM) # target from pkg_check_modules(LIBXSMM ... IMPORTED_TARGET)
 endif ()
 
 if (BLAS_LIBRARIES MATCHES "mkl_")
@@ -203,6 +203,25 @@ if (OpenMP_FOUND)
   target_link_libraries(dbcsr PRIVATE OpenMP::OpenMP_Fortran)
 endif ()
 
+# =================================================================================================
+# DBCSR LIBRARY's OPENCL BACKEND
+
+if (USE_OPENCL)
+  target_compile_definitions(dbcsr PRIVATE __DBCSR_ACC)
+  target_link_libraries(dbcsr PUBLIC ${OpenCL_LIBRARIES})
+
+  # OpenCL backend: built as an object library whose objects are added to dbcsr
+  set(DBCSR_ACC_SRCS ${DBCSR_OPENCL_SRCS})
+  add_library(acc OBJECT ${DBCSR_ACC_SRCS})
+  target_compile_definitions(acc PRIVATE __OPENCL)
+  # account for DBCSR not calling libsmm_acc_init() (DBCSR only calls acc_init)
+  target_compile_definitions(acc PRIVATE __DBCSR_ACC)
+  target_include_directories(acc PRIVATE ${OpenCL_INCLUDE_DIRS})
+  target_sources(dbcsr PRIVATE $<TARGET_OBJECTS:acc>)
+  add_subdirectory(acc/opencl/smm) # expected to define the libsmm_acc target
+  target_sources(dbcsr PRIVATE $<TARGET_OBJECTS:libsmm_acc>)
+endif ()
+
 # =================================================================================================
 # DBCSR LIBRARY's CUDA BACKEND
 
@@ -297,7 +316,6 @@ endif ()
 # DBCSR LIBRARY's HIP BACKEND
 
 if (USE_HIP)
-
   if (USE_OPENMP)
     set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${OpenMP_CXX_FLAGS}")
   endif ()
@@ -335,7 +353,6 @@ if (USE_HIP)
 
   target_compile_definitions(dbcsr PRIVATE __DBCSR_ACC)
   target_compile_definitions(dbcsr PRIVATE __HIP)
-
 endif ()
 
 # =================================================================================================
 
@@ -1,5 +1,5 @@
 {
 "description": "Generic accelerator API",
 "archive": "libdbcsr",
-"requires": ["../base", "cuda", "hip", "libsmm_acc"]
+"requires": ["../base", "cuda", "hip", "opencl", "libsmm_acc"]
 }
@@ -52,18 +52,20 @@ static void swap(int* m, int* n) { int tmp = *m; *m = *n; *n = tmp; }
 
 int main(int argc, char* argv[])
 {
-  const int nrepeat = (1 < argc ? atoi(argv[1]) : 5), offset = 0;
+  const int nrepeat = (1 < argc ? atoi(argv[1]) : 5);
   const int nodd = (0 < nrepeat ? ((nrepeat & 1/*odd*/) ? nrepeat : (nrepeat - 1)) : 1);
   const int stack_size = (2 < argc ? atoi(argv[2]) : 30000);
   const int m = (3 < argc ? atoi(argv[3]) : 23);
   const int n = (4 < argc ? atoi(argv[4]) : m);
+  const int offset = (5 < argc ? atoi(argv[5]) : 0);
+  const int offset_stack_size = offset + stack_size;
 #if defined(ALIGNMENT) && (0 < ALIGNMENT)
   const int mn = (int)ROUNDUP2(sizeof(ELEM_TYPE) * m, ALIGNMENT) * n / sizeof(ELEM_TYPE);
 #else
   const int mn = m * n;
 #endif
 #if defined(SHUFFLE)
-  const size_t shuffle = libxsmm_shuffle((unsigned int)stack_size);
+  const size_t shuffle = libxsmm_shuffle((unsigned int)offset_stack_size);
 #endif
 #if defined(WARMUP) && (0 < WARMUP) && !defined(_DEBUG)
   const int warmup = MAX(WARMUP, 2) / 2 * 2;
@@ -104,34 +106,34 @@ int main(int argc, char* argv[])
 #else
   CHECK(acc_stream_create(&stream, "stream", -1/*default priority*/), &result);
 #endif
-  CHECK(acc_host_mem_allocate((void**)&mat_hst, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
-  CHECK(acc_host_mem_allocate((void**)&stack_hst, sizeof(int) * stack_size, stream), &result);
+  CHECK(acc_host_mem_allocate((void**)&mat_hst, sizeof(ELEM_TYPE) * mn * offset_stack_size, stream), &result);
+  CHECK(acc_host_mem_allocate((void**)&stack_hst, sizeof(int) * offset_stack_size, stream), &result);
   CHECK(acc_stream_sync(stream), &result); /* ensure host-data is allocated */
-  for (i = 0; i < stack_size; ++i) { /* initialize matrices */
+  for (i = 0; i < offset_stack_size; ++i) { /* initialize matrices */
     init(i/*seed*/, &mat_hst[i*mn], m, n);
   }
-  for (i = 0; i < stack_size; ++i) { /* initialize indexes */
+  for (i = 0; i < offset_stack_size; ++i) { /* initialize indexes */
 #if defined(SHUFFLE)
-    const int j = mn * (int)((shuffle * i) % stack_size);
+    const int j = mn * (int)((shuffle * i) % offset_stack_size);
 #else
     const int j = mn * i;
 #endif
     stack_hst[i] = j;
   }
-  CHECK(acc_dev_mem_allocate((void**)&mat_dev, sizeof(ELEM_TYPE) * mn * stack_size), &result);
-  CHECK(acc_dev_mem_allocate((void**)&stack_dev, sizeof(int) * stack_size), &result);
+  CHECK(acc_dev_mem_allocate((void**)&mat_dev, sizeof(ELEM_TYPE) * mn * offset_stack_size), &result);
+  CHECK(acc_dev_mem_allocate((void**)&stack_dev, sizeof(int) * offset_stack_size), &result);
 #if defined(USE_LIBXSMM)
   CHECK(acc_stream_sync(stream), &result);
   start = libxsmm_timer_tick();
 #endif
-  CHECK(acc_memcpy_h2d(mat_hst, mat_dev, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
-  CHECK(acc_memcpy_h2d(stack_hst, stack_dev, sizeof(int) * stack_size, stream), &result);
+  CHECK(acc_memcpy_h2d(mat_hst, mat_dev, sizeof(ELEM_TYPE) * mn * offset_stack_size, stream), &result);
+  CHECK(acc_memcpy_h2d(stack_hst, stack_dev, sizeof(int) * offset_stack_size, stream), &result);
 #if defined(USE_LIBXSMM)
   CHECK(acc_stream_sync(stream), &result);
   duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
   printf("copy-in: %.1f ms %.1f GB/s\n", 1000.0 * duration,
     (sizeof(ELEM_TYPE) * mn + sizeof(int))
-      * stack_size / (duration * (1ULL << 30)));
+      * offset_stack_size / (duration * (1ULL << 30)));
 #endif
   /* warmup execution and prebuild JIT kernels */
   for (r = 0; r < warmup / 2; ++r) {
@@ -156,25 +158,25 @@ int main(int argc, char* argv[])
     assert(0 < nodd && (nodd & 1/*odd*/));
     printf("device: %.1f ms %.1f GB/s\n", 1000.0 * duration / nodd,
       (sizeof(ELEM_TYPE) * mn + sizeof(int))
-        * stack_size / (duration * (1ULL << 30) / nodd));
+        * offset_stack_size / (duration * (1ULL << 30) / nodd));
     mm = m; nn = n;
     start = libxsmm_timer_tick();
     for (r = 0; r < nodd; ++r) {
       libxsmm_itrans_batch_omp(mat_hst, sizeof(ELEM_TYPE), mm, nn, mm, nn,
-        0/*index_base*/, sizeof(int)/*index_stride*/, stack_hst, stack_size);
+        0/*index_base*/, sizeof(int)/*index_stride*/, stack_hst + offset, stack_size);
       swap(&mm, &nn);
     }
     duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
     printf("host: %.1f ms %.1f GB/s\n", 1000.0 * duration / nodd,
       (sizeof(ELEM_TYPE) * mn + sizeof(int))
-        * stack_size / (duration * (1ULL << 30) / nodd));
+        * offset_stack_size / (duration * (1ULL << 30) / nodd));
     /* transfer result from device to host for validation */
     CHECK(acc_memcpy_d2h(mat_dev, mat_hst,
-      sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
+      sizeof(ELEM_TYPE) * mn * offset_stack_size, stream), &result);
     CHECK(acc_stream_sync(stream), &result);
     if (EXIT_SUCCESS == result) {
       unsigned int nerrors = 0;
-      for (i = 0; i < stack_size; ++i) {
+      for (i = offset; i < offset_stack_size; ++i) {
         ELEM_TYPE gold[MAX_KERNEL_DIM*MAX_KERNEL_DIM];
         const ELEM_TYPE *const test = mat_hst + mn * i;
         init(i/*seed*/, gold, m, n);
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`{`
`2`	`2`	`"description": "Generic accelerator API",`
`3`	`3`	`"archive": "libdbcsr",`
`4`		`-"requires": ["../base", "cuda", "hip", "libsmm_acc"]`
	`4`	`+"requires": ["../base", "cuda", "hip", "opencl", "libsmm_acc"]`
`5`	`5`	`}`