Skip to content

Commit 0fa4776

Browse files
committed
Completed the OpenCL backend implementation; the regression tests pass. Added validation internal to the OpenCL backend (disabled by default), which is useful for debugging failing tests.
1 parent 75303ca commit 0fa4776

23 files changed

+2815
-38
lines changed

CMakeLists.txt

+23-12
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,16 @@ set_property(CACHE USE_SMM PROPERTY STRINGS blas libxsmm)
9292

9393
option(USE_CUDA "Build with CUDA support" OFF)
9494
option(USE_HIP "Build with HIP support" OFF)
95-
# USE_CUDA and USE_HIP are mutually exclusive options: we either compile with
96-
# nvcc OR with hipcc
97-
if (USE_CUDA AND USE_HIP)
95+
option(USE_OPENCL "Build with OpenCL support" OFF)
96+
97+
# USE_CUDA, USE_HIP, and USE_OPENCL shall be mutually exclusive options
98+
if ((USE_CUDA AND USE_HIP)
99+
OR (USE_CUDA AND USE_OPENCL)
100+
OR (USE_OPENCL AND USE_HIP))
98101
message(
99102
FATAL_ERROR
100-
"USE_CUDA and USE_HIP options are mutually exclusive. Please choose one.")
103+
"USE_CUDA, USE_HIP, and USE_OPENCL are mutually exclusive. Please choose one."
104+
)
101105
endif ()
102106

103107
set(SUPPORTED_CUDA_ARCHITECTURES K20X K40 K80 P100 V100)
@@ -117,9 +121,10 @@ enable_language(Fortran)
117121

118122
if (WITH_C_API AND WITH_EXAMPLES)
119123
enable_language(CXX)
124+
enable_language(C)
120125
endif ()
121126

122-
# we're always using at least C++11
127+
# always use at least C++11
123128
set(CMAKE_CXX_STANDARD 11)
124129

125130
# =================================================================================================
@@ -141,8 +146,8 @@ find_package(LAPACK REQUIRED) # needed for some of the integrated test routines,
141146
# environment for a python interpreter before searching elsewhere in the system.
142147
# In CMake <3.15, the system is searched before the virtual environment.
143148
if (NOT Python_EXECUTABLE)
144-
# If the python interpreter isn't specified as a command line option, look for
145-
# it:
149+
# If the python interpreter is not specified as a command line option, look
150+
# for it:
146151
find_package(
147152
Python
148153
COMPONENTS Interpreter
@@ -186,13 +191,22 @@ if (USE_SMM MATCHES "blas")
186191
message("-- Using BLAS for Small Matrix Multiplication")
187192
elseif (USE_SMM MATCHES "libxsmm")
188193
# rely on pkg-config in order to link against libxsmm
189-
pkg_check_modules(deps REQUIRED IMPORTED_TARGET GLOBAL libxsmmf)
194+
pkg_check_modules(LIBXSMM REQUIRED IMPORTED_TARGET GLOBAL libxsmmf)
190195
message("-- Using libxsmm for Small Matrix Multiplication")
191196
else ()
192197
message(FATAL_ERROR "Unknown SMM library specified")
193198
endif ()
194199

195-
# =================================== GPU backend
200+
# =================================== GPU backends
201+
if (USE_OPENCL)
202+
if (NOT LIBXSMM_FOUND)
203+
message(FATAL_ERROR "LIBXSMM is not found but required for ACC/OpenCL.")
204+
endif ()
205+
206+
find_package(OpenCL REQUIRED)
207+
enable_language(C)
208+
endif ()
209+
196210
if (USE_CUDA OR USE_HIP)
197211
enable_language(CXX)
198212
set(GPU_ARCH_NUMBER_K20X 35)
@@ -204,7 +218,6 @@ if (USE_CUDA OR USE_HIP)
204218
endif ()
205219

206220
if (USE_CUDA)
207-
208221
enable_language(CUDA)
209222
if (CMAKE_CUDA_COMPILER_VERSION LESS 5.5)
210223
message(FATAL_ERROR "CUDA version >= 5.5 is required.")
@@ -243,7 +256,6 @@ if (USE_CUDA)
243256
else ()
244257
message(STATUS "Found cuBLAS: ${CUBLAS}")
245258
endif ()
246-
247259
if (WITH_CUDA_PROFILING)
248260
find_library(
249261
CUDA_NVTOOLSEXT nvToolsExt
@@ -258,7 +270,6 @@ endif ()
258270
# inspired from
259271
# https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/2_Cookbook/12_cmake_hip_add_executable
260272
if (USE_HIP)
261-
262273
# Make sure the GPU required is supported
263274
list(FIND SUPPORTED_HIP_ARCHITECTURES ${WITH_GPU} GPU_SUPPORTED)
264275
if (GPU_SUPPORTED EQUAL -1)

cmake/CompilerConfiguration.cmake

+5
Original file line numberDiff line numberDiff line change
@@ -88,3 +88,8 @@ Please open an issue at https://github.com/cp2k/dbcsr/issues with the reported c
8888
message("-- CMAKE_CXX_COMPILER_ID: " ${CMAKE_CXX_COMPILER_ID})
8989
message("-- CMAKE_CXX_COMPILER full path: " ${CMAKE_CXX_COMPILER})
9090
endif ()
91+
92+
# inherit C flags from CXX
93+
set(CMAKE_C_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
94+
set(CMAKE_C_FLAGS_COVERAGE ${CMAKE_CXX_FLAGS_COVERAGE})
95+
set(CMAKE_C_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})

src/CMakeLists.txt

+25-8
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,10 @@ add_fypp_sources(
105105
utils/dbcsr_toollib.F
106106
work/dbcsr_work_operations.F)
107107

108+
set(DBCSR_OPENCL_SRCS
109+
acc/opencl/acc_opencl.c acc/opencl/acc_opencl_event.c
110+
acc/opencl/acc_opencl_mem.c acc/opencl/acc_opencl_stream.c)
111+
108112
set(DBCSR_CUDA_SRCS
109113
acc/cuda/acc_cublas.cu
110114
acc/cuda/acc_cuda.cpp
@@ -141,13 +145,9 @@ add_library(dbcsr ${DBCSR_SRCS})
141145
set_target_properties(dbcsr PROPERTIES VERSION ${dbcsr_VERSION}
142146
SOVERSION ${dbcsr_APIVERSION})
143147

144-
if (TARGET PkgConfig::deps)
145-
target_link_libraries(dbcsr PRIVATE PkgConfig::deps)
146-
endif ()
147-
148-
if (USE_SMM MATCHES "libxsmm")
149-
# linker/include flags are managed by pkg-config (above)
148+
if (LIBXSMM_FOUND)
150149
target_compile_definitions(dbcsr PRIVATE __LIBXSMM)
150+
target_link_libraries(dbcsr PRIVATE PkgConfig::LIBXSMM)
151151
endif ()
152152

153153
if (BLAS_LIBRARIES MATCHES "mkl_")
@@ -203,6 +203,25 @@ if (OpenMP_FOUND)
203203
target_link_libraries(dbcsr PRIVATE OpenMP::OpenMP_Fortran)
204204
endif ()
205205

206+
# =================================================================================================
207+
# DBCSR LIBRARY's OPENCL BACKEND
208+
209+
if (USE_OPENCL)
210+
target_compile_definitions(dbcsr PRIVATE __DBCSR_ACC)
211+
target_link_libraries(dbcsr PUBLIC ${OpenCL_LIBRARY})
212+
213+
# OpenCL backend
214+
set(DBCSR_ACC_SRCS ${DBCSR_OPENCL_SRCS})
215+
add_library(acc OBJECT ${DBCSR_ACC_SRCS})
216+
target_compile_definitions(acc PRIVATE __OPENCL)
217+
# account for DBCSR not calling libsmm_acc_init() (DBCSR only calls acc_init)
218+
target_compile_definitions(acc PRIVATE __DBCSR_ACC)
219+
target_include_directories(acc PRIVATE ${OpenCL_INCLUDE_DIRS})
220+
target_sources(dbcsr PRIVATE $<TARGET_OBJECTS:acc>)
221+
add_subdirectory(acc/opencl/smm)
222+
target_sources(dbcsr PRIVATE $<TARGET_OBJECTS:libsmm_acc>)
223+
endif ()
224+
206225
# =================================================================================================
207226
# DBCSR LIBRARY's CUDA BACKEND
208227

@@ -297,7 +316,6 @@ endif ()
297316
# DBCSR LIBRARY's HIP BACKEND
298317

299318
if (USE_HIP)
300-
301319
if (USE_OPENMP)
302320
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${OpenMP_CXX_FLAGS}")
303321
endif ()
@@ -335,7 +353,6 @@ if (USE_HIP)
335353

336354
target_compile_definitions(dbcsr PRIVATE __DBCSR_ACC)
337355
target_compile_definitions(dbcsr PRIVATE __HIP)
338-
339356
endif ()
340357

341358
# =================================================================================================

src/acc/PACKAGE

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
22
"description": "Generic accelerator API",
33
"archive": "libdbcsr",
4-
"requires": ["../base", "cuda", "hip", "libsmm_acc"]
4+
"requires": ["../base", "cuda", "hip", "opencl", "libsmm_acc"]
55
}

src/acc/acc_bench_trans.c

+19-17
Original file line numberDiff line numberDiff line change
@@ -52,18 +52,20 @@ static void swap(int* m, int* n) { int tmp = *m; *m = *n; *n = tmp; }
5252

5353
int main(int argc, char* argv[])
5454
{
55-
const int nrepeat = (1 < argc ? atoi(argv[1]) : 5), offset = 0;
55+
const int nrepeat = (1 < argc ? atoi(argv[1]) : 5);
5656
const int nodd = (0 < nrepeat ? ((nrepeat & 1/*odd*/) ? nrepeat : (nrepeat - 1)) : 1);
5757
const int stack_size = (2 < argc ? atoi(argv[2]) : 30000);
5858
const int m = (3 < argc ? atoi(argv[3]) : 23);
5959
const int n = (4 < argc ? atoi(argv[4]) : m);
60+
const int offset = (5 < argc ? atoi(argv[5]) : 0);
61+
const int offset_stack_size = offset + stack_size;
6062
#if defined(ALIGNMENT) && (0 < ALIGNMENT)
6163
const int mn = (int)ROUNDUP2(sizeof(ELEM_TYPE) * m, ALIGNMENT) * n / sizeof(ELEM_TYPE);
6264
#else
6365
const int mn = m * n;
6466
#endif
6567
#if defined(SHUFFLE)
66-
const size_t shuffle = libxsmm_shuffle((unsigned int)stack_size);
68+
const size_t shuffle = libxsmm_shuffle((unsigned int)offset_stack_size);
6769
#endif
6870
#if defined(WARMUP) && (0 < WARMUP) && !defined(_DEBUG)
6971
const int warmup = MAX(WARMUP, 2) / 2 * 2;
@@ -104,34 +106,34 @@ int main(int argc, char* argv[])
104106
#else
105107
CHECK(acc_stream_create(&stream, "stream", -1/*default priority*/), &result);
106108
#endif
107-
CHECK(acc_host_mem_allocate((void**)&mat_hst, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
108-
CHECK(acc_host_mem_allocate((void**)&stack_hst, sizeof(int) * stack_size, stream), &result);
109+
CHECK(acc_host_mem_allocate((void**)&mat_hst, sizeof(ELEM_TYPE) * mn * offset_stack_size, stream), &result);
110+
CHECK(acc_host_mem_allocate((void**)&stack_hst, sizeof(int) * offset_stack_size, stream), &result);
109111
CHECK(acc_stream_sync(stream), &result); /* ensure host-data is allocated */
110-
for (i = 0; i < stack_size; ++i) { /* initialize matrices */
112+
for (i = 0; i < offset_stack_size; ++i) { /* initialize matrices */
111113
init(i/*seed*/, &mat_hst[i*mn], m, n);
112114
}
113-
for (i = 0; i < stack_size; ++i) { /* initialize indexes */
115+
for (i = 0; i < offset_stack_size; ++i) { /* initialize indexes */
114116
#if defined(SHUFFLE)
115-
const int j = mn * (int)((shuffle * i) % stack_size);
117+
const int j = mn * (int)((shuffle * i) % offset_stack_size);
116118
#else
117119
const int j = mn * i;
118120
#endif
119121
stack_hst[i] = j;
120122
}
121-
CHECK(acc_dev_mem_allocate((void**)&mat_dev, sizeof(ELEM_TYPE) * mn * stack_size), &result);
122-
CHECK(acc_dev_mem_allocate((void**)&stack_dev, sizeof(int) * stack_size), &result);
123+
CHECK(acc_dev_mem_allocate((void**)&mat_dev, sizeof(ELEM_TYPE) * mn * offset_stack_size), &result);
124+
CHECK(acc_dev_mem_allocate((void**)&stack_dev, sizeof(int) * offset_stack_size), &result);
123125
#if defined(USE_LIBXSMM)
124126
CHECK(acc_stream_sync(stream), &result);
125127
start = libxsmm_timer_tick();
126128
#endif
127-
CHECK(acc_memcpy_h2d(mat_hst, mat_dev, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
128-
CHECK(acc_memcpy_h2d(stack_hst, stack_dev, sizeof(int) * stack_size, stream), &result);
129+
CHECK(acc_memcpy_h2d(mat_hst, mat_dev, sizeof(ELEM_TYPE) * mn * offset_stack_size, stream), &result);
130+
CHECK(acc_memcpy_h2d(stack_hst, stack_dev, sizeof(int) * offset_stack_size, stream), &result);
129131
#if defined(USE_LIBXSMM)
130132
CHECK(acc_stream_sync(stream), &result);
131133
duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
132134
printf("copy-in: %.1f ms %.1f GB/s\n", 1000.0 * duration,
133135
(sizeof(ELEM_TYPE) * mn + sizeof(int))
134-
* stack_size / (duration * (1ULL << 30)));
136+
* offset_stack_size / (duration * (1ULL << 30)));
135137
#endif
136138
/* warmup execution and prebuild JIT kernels */
137139
for (r = 0; r < warmup / 2; ++r) {
@@ -156,25 +158,25 @@ int main(int argc, char* argv[])
156158
assert(0 < nodd && (nodd & 1/*odd*/));
157159
printf("device: %.1f ms %.1f GB/s\n", 1000.0 * duration / nodd,
158160
(sizeof(ELEM_TYPE) * mn + sizeof(int))
159-
* stack_size / (duration * (1ULL << 30) / nodd));
161+
* offset_stack_size / (duration * (1ULL << 30) / nodd));
160162
mm = m; nn = n;
161163
start = libxsmm_timer_tick();
162164
for (r = 0; r < nodd; ++r) {
163165
libxsmm_itrans_batch_omp(mat_hst, sizeof(ELEM_TYPE), mm, nn, mm, nn,
164-
0/*index_base*/, sizeof(int)/*index_stride*/, stack_hst, stack_size);
166+
0/*index_base*/, sizeof(int)/*index_stride*/, stack_hst + offset, stack_size);
165167
swap(&mm, &nn);
166168
}
167169
duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
168170
printf("host: %.1f ms %.1f GB/s\n", 1000.0 * duration / nodd,
169171
(sizeof(ELEM_TYPE) * mn + sizeof(int))
170-
* stack_size / (duration * (1ULL << 30) / nodd));
172+
* offset_stack_size / (duration * (1ULL << 30) / nodd));
171173
/* transfer result from device to host for validation */
172174
CHECK(acc_memcpy_d2h(mat_dev, mat_hst,
173-
sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
175+
sizeof(ELEM_TYPE) * mn * offset_stack_size, stream), &result);
174176
CHECK(acc_stream_sync(stream), &result);
175177
if (EXIT_SUCCESS == result) {
176178
unsigned int nerrors = 0;
177-
for (i = 0; i < stack_size; ++i) {
179+
for (i = offset; i < offset_stack_size; ++i) {
178180
ELEM_TYPE gold[MAX_KERNEL_DIM*MAX_KERNEL_DIM];
179181
const ELEM_TYPE *const test = mat_hst + mn * i;
180182
init(i/*seed*/, gold, m, n);

0 commit comments

Comments
 (0)