diff --git a/.ci/daint.cscs.ch/cray.build.sh b/.ci/daint.cscs.ch/cray.build.sh index ed64c1ba521..82016b666a0 100755 --- a/.ci/daint.cscs.ch/cray.build.sh +++ b/.ci/daint.cscs.ch/cray.build.sh @@ -13,7 +13,7 @@ set -o errexit set -o nounset set -o pipefail -module load daint-gpu cudatoolkit CMake/3.14.5 +module load daint-gpu cudatoolkit CMake/3.18.4 module unload cray-libsci_acc # make sure a recent GCC is available as NVCC backend: # nvcc does not automatically use Cray's CC as backend diff --git a/.ci/daint.cscs.ch/cray.test.sh b/.ci/daint.cscs.ch/cray.test.sh index 33ecfab389f..ee5df55151b 100755 --- a/.ci/daint.cscs.ch/cray.test.sh +++ b/.ci/daint.cscs.ch/cray.test.sh @@ -13,7 +13,7 @@ set -o errexit set -o nounset set -o pipefail -module load daint-gpu cudatoolkit CMake/3.14.5 +module load daint-gpu cudatoolkit CMake/3.18.4 module unload cray-libsci_acc module list diff --git a/.ci/daint.cscs.ch/gnu.build.sh b/.ci/daint.cscs.ch/gnu.build.sh index 84157a2a743..76984955c40 100755 --- a/.ci/daint.cscs.ch/gnu.build.sh +++ b/.ci/daint.cscs.ch/gnu.build.sh @@ -14,7 +14,7 @@ set -o nounset set -o pipefail module swap PrgEnv-cray PrgEnv-gnu -module load daint-gpu cudatoolkit CMake/3.14.5 +module load daint-gpu cudatoolkit CMake/3.18.4 module unload cray-libsci_acc module list diff --git a/.ci/daint.cscs.ch/gnu.test.sh b/.ci/daint.cscs.ch/gnu.test.sh index c0469575f48..2ed9c13b050 100755 --- a/.ci/daint.cscs.ch/gnu.test.sh +++ b/.ci/daint.cscs.ch/gnu.test.sh @@ -14,7 +14,7 @@ set -o nounset set -o pipefail module swap PrgEnv-cray PrgEnv-gnu -module load daint-gpu cudatoolkit CMake/3.14.5 +module load daint-gpu cudatoolkit CMake/3.18.4 module unload cray-libsci_acc module list diff --git a/.ci/daint.cscs.ch/intel.build.sh b/.ci/daint.cscs.ch/intel.build.sh index 1b87da2f61b..4a92733d87d 100755 --- a/.ci/daint.cscs.ch/intel.build.sh +++ b/.ci/daint.cscs.ch/intel.build.sh @@ -14,7 +14,7 @@ set -o nounset set -o pipefail module swap PrgEnv-cray PrgEnv-intel -module load daint-gpu cudatoolkit CMake/3.14.5 +module load daint-gpu cudatoolkit CMake/3.18.4 module unload cray-libsci_acc # make sure a recent GCC is available as NVCC backend: # nvcc does not automatically use Cray's CC as backend diff --git a/.ci/daint.cscs.ch/intel.test.sh b/.ci/daint.cscs.ch/intel.test.sh index 03b94246565..0e3497a7cf0 100755 --- a/.ci/daint.cscs.ch/intel.test.sh +++ b/.ci/daint.cscs.ch/intel.test.sh @@ -14,7 +14,7 @@ set -o nounset set -o pipefail module swap PrgEnv-cray PrgEnv-intel -module load daint-gpu cudatoolkit CMake/3.14.5 +module load daint-gpu cudatoolkit CMake/3.18.4 module unload cray-libsci_acc module list diff --git a/.ci/daint.cscs.ch/ocl.build.sh b/.ci/daint.cscs.ch/ocl.build.sh index 5d0676e0788..52c153ee929 100755 --- a/.ci/daint.cscs.ch/ocl.build.sh +++ b/.ci/daint.cscs.ch/ocl.build.sh @@ -14,7 +14,7 @@ set -o nounset set -o pipefail module swap PrgEnv-cray PrgEnv-gnu -module load daint-gpu cudatoolkit CMake/3.14.5 +module load daint-gpu cudatoolkit CMake/3.18.4 module unload cray-libsci_acc module list diff --git a/.ci/daint.cscs.ch/ocl.test.sh b/.ci/daint.cscs.ch/ocl.test.sh index 19df5c35f22..c7a180bd22e 100755 --- a/.ci/daint.cscs.ch/ocl.test.sh +++ b/.ci/daint.cscs.ch/ocl.test.sh @@ -14,7 +14,7 @@ set -o nounset set -o pipefail module swap PrgEnv-cray PrgEnv-gnu -module load daint-gpu cudatoolkit CMake/3.14.5 +module load daint-gpu cudatoolkit CMake/3.18.4 module unload cray-libsci_acc module list diff --git a/.github/workflows/testing-linux.yml b/.github/workflows/testing-linux.yml index 0572a377c87..b274acaae56 100644 --- a/.github/workflows/testing-linux.yml +++ b/.github/workflows/testing-linux.yml @@ -103,6 +103,8 @@ jobs: -DUSE_${{ matrix.use_openmp }} \ -DUSE_ACCEL=hip \ -DWITH_GPU=Mi50 \ + -DWITH_EXAMPLES=ON \ + -DCMAKE_PREFIX_PATH=/opt/rocm \ .. - name: Build run: cmake --build build -- --verbose diff --git a/.gitignore b/.gitignore index a0b014766dc..e75b99731b7 100644 --- a/.gitignore +++ b/.gitignore @@ -223,3 +223,5 @@ tags .tags # End of https://www.gitignore.io/api/vim,emacs,python,fortran + +spack-* \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 544399b33fb..5e672d39da5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.12) +cmake_minimum_required(VERSION 3.17) # include our cmake snippets set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake) @@ -118,18 +118,28 @@ endif () # always use at least C++11 set(CMAKE_CXX_STANDARD 11) -# ================================================================================================= -# PACKAGE DISCOVERY (compiler configuration can impact package discovery) -find_package(PkgConfig) - # =================================== OpenMP if (USE_OPENMP) find_package(OpenMP REQUIRED) endif () +if ((USE_ACCEL MATCHES "opencl") AND (NOT USE_SMM MATCHES "libxsmm")) + message(FATAL_ERROR "OpenCL requires USE_SMM=libxsmm") +endif () + +# =================================== SMM (Small Matrix-Matrix multiplication) +if (USE_SMM MATCHES "blas") + message(STATUS "Using BLAS for Small Matrix Multiplication") +elseif (USE_SMM MATCHES "libxsmm") + message(STATUS "Using libxsmm for Small Matrix Multiplication") +else () + message(FATAL_ERROR "Unknown SMM library specified") +endif () + # =================================== LIBXSMM (rely on pkg-config) -if ((USE_SMM MATCHES "libxsmm") OR (USE_ACCEL MATCHES "opencl")) - pkg_check_modules(LIBXSMM IMPORTED_TARGET GLOBAL libxsmmf) +if (USE_SMM MATCHES "libxsmm") + find_package(PkgConfig REQUIRED) + pkg_check_modules(LIBXSMM REQUIRED IMPORTED_TARGET GLOBAL libxsmmf) endif () # =================================== BLAS & LAPACK, PkgConfig @@ -152,8 +162,6 @@ endif () # =================================== MPI if (USE_MPI) get_property(REQUIRED_MPI_COMPONENTS GLOBAL PROPERTY ENABLED_LANGUAGES) - list(REMOVE_ITEM REQUIRED_MPI_COMPONENTS CUDA) # CUDA does not have an MPI - # component if (NOT CMAKE_CROSSCOMPILING) # when cross compiling, assume the users know # what they are doing set(MPI_DETERMINE_LIBRARY_VERSION TRUE) @@ -181,34 +189,8 @@ Intel MPI compiler wrappers. Check the INSTALL.md for more information.") endif () endif () -# =================================== SMM (Small Matrix-Matrix multiplication) -if (USE_SMM MATCHES "blas") - message("-- Using BLAS for Small Matrix Multiplication") -elseif (USE_SMM MATCHES "libxsmm") - if (LIBXSMM_FOUND) - message("-- Using LIBXSMM for Small Matrix Multiplication") - else () - message( - FATAL_ERROR - "LIBXSMM is not found but requested (USE_SMM). " - "Please install PkgConfig, build LIBXSMM, and " - "set PKG_CONFIG_PATH=/path/to/libxsmm/lib") - endif () -else () - message(FATAL_ERROR "Unknown SMM library specified") -endif () - # =================================== GPU backends if (USE_ACCEL MATCHES "opencl") - if (NOT LIBXSMM_FOUND) - message( - FATAL_ERROR - "LIBXSMM is not found but required for " - "LIBSMM based on the ACC/OpenCL backend. " - "Please install PkgConfig, LIBXSMM, and " - "set PKG_CONFIG_PATH=/path/to/libxsmm/lib") - endif () - find_package(OpenCL REQUIRED) enable_language(C) endif () @@ -224,8 +206,9 @@ if (USE_ACCEL MATCHES "cuda|hip") endif () if (USE_ACCEL MATCHES "cuda") - enable_language(CUDA) - if (CMAKE_CUDA_COMPILER_VERSION LESS 5.5) + find_package(CUDAToolkit REQUIRED) + + if (CUDAToolkit_VERSION LESS 5.5) message(FATAL_ERROR "CUDA version >= 5.5 is required.") endif () @@ -237,43 +220,14 @@ if (USE_ACCEL MATCHES "cuda") "Please choose from: ${SUPPORTED_CUDA_ARCHITECTURES}") endif () - # assume that the backend compiler for nvcc understands the -std=c++11 - set(CMAKE_CUDA_STANDARD 11) - set(CMAKE_CUDA_STANDARD_REQUIRED ON) - # set cuda architecture number and compilation flags set(ACC_ARCH_NUMBER ${GPU_ARCH_NUMBER_${WITH_GPU}}) - # TODO: use CMAKE_CUDA_RUNTIME_LIBRARY with CMake 3.17+ and CUDA_ARCHITECTURES - # with CMake 3.18+ - string(APPEND CMAKE_CUDA_FLAGS " -arch=sm_${ACC_ARCH_NUMBER} --cudart static") - add_compile_definitions($<$:__CUDA>) + message(STATUS "GPU target architecture: " ${WITH_GPU}) message(STATUS "GPU architecture number: " ${ACC_ARCH_NUMBER}) message(STATUS "GPU profiling enabled: " ${WITH_CUDA_PROFILING}) - - # =================================== BLAS on GPU backend - find_library(CUBLAS cublas HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) - if (NOT CUBLAS) - message( - FATAL_ERROR - "cuBLAS library not found but support required for DBCSR's CUDA backend" - ) - else () - message(STATUS "Found cuBLAS: ${CUBLAS}") - endif () - if (WITH_CUDA_PROFILING) - find_library( - CUDA_NVTOOLSEXT nvToolsExt - PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} - DOC "Building with CUDA profiling requires the nvToolsExt CUDA library" - REQUIRED) - message(STATUS "Found nvToolsExt: ${CUDA_NVTOOLSEXT}") - endif () - endif () -# inspired from -# https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/2_Cookbook/12_cmake_hip_add_executable if (USE_ACCEL MATCHES "hip") # Make sure the GPU required is supported list(FIND SUPPORTED_HIP_ARCHITECTURES ${WITH_GPU} GPU_SUPPORTED) @@ -283,20 +237,8 @@ if (USE_ACCEL MATCHES "hip") "Please choose from: ${SUPPORTED_HIP_ARCHITECTURES}") endif () - # Set path to HIP installation, include HIP cmake utilities - if (NOT DEFINED HIP_PATH) - if (NOT DEFINED ENV{HIP_PATH}) - set(HIP_PATH - "/opt/rocm/hip" - CACHE PATH "Path to HIP installation") - else () - set(HIP_PATH - $ENV{HIP_PATH} - CACHE PATH "Path to HIP installation") - endif () - endif () - set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${HIP_PATH}/cmake") - + # ROCm is typically installed in /opt/rocm; otherwise let the user set + # ROCM_PATH as an environment variable or define. if (NOT DEFINED ROCM_PATH) if (NOT DEFINED ENV{ROCM_PATH}) set(ROCM_PATH @@ -309,60 +251,16 @@ if (USE_ACCEL MATCHES "hip") endif () endif () - # Find HIP package - find_package(HIP) - if (HIP_FOUND) - message(STATUS "Found HIP: " ${HIP_VERSION}) - else () - message( - FATAL_ERROR - "Could not find HIP. Ensure that HIP is either installed in /opt/rocm/hip or the variable HIP_PATH is set to point to the right location." - ) - endif () + # Notice: this is not FindHIP.cmake for hip language support, but + # hip-config.cmake which contains targets like hip::host for jitting. + find_package(hip CONFIG REQUIRED HINTS ${ROCM_PATH}) - # Find hiprtc library (adds support for JIT-ing in HIP) - find_library(ROCM_HIPRTC_LIB amdhip64 HINTS ${HIP_PATH}/lib) - if (NOT ROCM_HIPRTC_LIB) - message( - FATAL_ERROR "HIPRTC (HIP library for just-in-time compilation) not found") - endif () - set(ENV{HIP_PATH} /opt/rocm/hip) # workaround bug in hiprtc.cpp - - # Set platform to compile for (NVIDIA-nvcc or ROCm-hcc) as well as - # corresponding architecture and flags adapted from: - # https://github.com/ROCmSoftwarePlatform/hipDNN/blob/master/CMakeLists.txt - execute_process(COMMAND ${HIP_PATH}/bin/hipconfig -P - OUTPUT_VARIABLE HIP_PLATFORM) - message(STATUS "Compiling for platform: " ${HIP_PLATFORM}) - - # set appropriate compilation flags depending on platform set(ACC_ARCH_NUMBER ${GPU_ARCH_NUMBER_${WITH_GPU}}) - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -D__HIP -O3") - if (${HIP_PLATFORM} STREQUAL "nvcc") - set(HIP_HIPCC_FLAGS - "${HIP_HIPCC_FLAGS} -std=c++11 -arch=sm_${ACC_ARCH_NUMBER} --cudart static" - ) - else () - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -fPIC") - endif () message(STATUS "GPU target architecture: " ${WITH_GPU}) message(STATUS "GPU architecture number: " ${ACC_ARCH_NUMBER}) - message(STATUS "HIPCC flags: " ${HIP_HIPCC_FLAGS}) - if (USE_OPENMP) - set(HIP_OpenMP_FLAGS "-L${ROCM_PATH}/llvm/lib -lomp") - message(STATUS "HIP OpenMP linking flags: ${HIP_OpenMP_FLAGS}") - endif () # =================================== BLAS on GPU backend - find_library(HIPBLAS hipblas HINTS ${HIP_PATH}/../lib) # /opt/rocm/lib - if (NOT HIPBLAS) - message( - FATAL_ERROR - "hipBLAS library not found but support required for DBCSR's HIP backend" - ) - else () - message(STATUS "Found hipBLAS: ${HIPBLAS}") - endif () + find_package(hipblas CONFIG REQUIRED HINTS ${ROCM_PATH}) endif () # ================================================================================================= diff --git a/cmake/CompilerConfiguration.cmake b/cmake/CompilerConfiguration.cmake index af29b4dd0a2..351ea8d53b7 100644 --- a/cmake/CompilerConfiguration.cmake +++ b/cmake/CompilerConfiguration.cmake @@ -41,7 +41,7 @@ Please open an issue at https://github.com/cp2k/dbcsr/issues with the reported c endif () if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g -funroll-loops -Wall -Werror") + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g -funroll-loops -Wall") set(CMAKE_CXX_FLAGS_COVERAGE "-O0 -g --coverage -Wall -Werror") set(CMAKE_CXX_FLAGS_DEBUG "-O2 -ggdb -Wall -Werror -fsanitize=undefined -fsanitize=address -fsanitize-recover=all") if ((NOT (USE_MPI)) OR (NOT ("${MPI_Fortran_LIBRARY_VERSION_STRING}" MATCHES "Open MPI"))) diff --git a/docs/guide/2-user-guide/1-installation/index.md b/docs/guide/2-user-guide/1-installation/index.md index 88f01310b2b..52c9400a6cf 100644 --- a/docs/guide/2-user-guide/1-installation/index.md +++ b/docs/guide/2-user-guide/1-installation/index.md @@ -96,10 +96,13 @@ If MPI support is enabled (the default), the C API is automatically built. ### Workaround issue in HIP -HIP is a relatively new language, and some issues still need to be ironed out. As a workaround to an [issue](https://github.com/ROCm-Developer-Tools/HIP/pull/1543) in HIP's JIT infrastructure, please set the following if you've built HIP from source: +For custom installs of HIP 3.9.0 and above, some paths have to be configured to ensure the JIT compiler can locate the HIP runtime and compiler tools ```bash -export HIP_PATH=/opt/rocm/hip +export ROCM_PATH=/path/to/hip-3.9.0 +export HIP_PATH=$ROCM_PATH +export LLVM_PATH=/path/to/llvm-amdgpu-3.9.0 +export HIP_DEVICE_LIB_PATH=/path/to/rocm-device-libs-3.9.0/amdgcn/bitcode ``` before running on an AMD GPU. diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 6b7fdbadf3c..4af44e4a41f 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -6,11 +6,7 @@ set(DBCSR_PROGRAM_SRCS_CPP dbcsr_example_3.cpp dbcsr_tensor_example_2.cpp) # Compile Fortran examples foreach (dbcsr_program_src ${DBCSR_PROGRAM_SRCS_FTN}) get_filename_component(dbcsr_program_name ${dbcsr_program_src} NAME_WE) - if (USE_ACCEL MATCHES "hip") - hip_add_executable(${dbcsr_program_name} ${dbcsr_program_src}) - else () - add_executable(${dbcsr_program_name} ${dbcsr_program_src}) - endif () + add_executable(${dbcsr_program_name} ${dbcsr_program_src}) target_link_libraries(${dbcsr_program_name} dbcsr) # with the Intel compiler CMake 3.12 seems to forget that the source is @@ -24,11 +20,7 @@ if (WITH_C_API) foreach (dbcsr_program_src ${DBCSR_PROGRAM_SRCS_CPP}) get_filename_component(dbcsr_program_name ${dbcsr_program_src} NAME_WE) set(dbcsr_program_name ${dbcsr_program_name}_cpp) - if (USE_ACCEL MATCHES "hip") - hip_add_executable(${dbcsr_program_name} ${dbcsr_program_src}) - else () - add_executable(${dbcsr_program_name} ${dbcsr_program_src}) - endif () + add_executable(${dbcsr_program_name} ${dbcsr_program_src}) target_link_libraries(${dbcsr_program_name} dbcsr_c MPI::MPI_CXX) if (CMAKE_CXX_COMPILER_ID STREQUAL "Cray") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f75ebc7da9b..351373e29fa 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -7,7 +7,7 @@ include(CMakePackageConfigHelpers) # ================================================================================================= # SOURCE FILE LISTS add_fypp_sources( - DBCSR_SRCS + DBCSR_FORTRAN_SRCS dbcsr_api.F acc/cuda/dbcsr_cuda_profiling.F acc/dbcsr_acc_device.F @@ -105,24 +105,11 @@ add_fypp_sources( utils/dbcsr_toollib.F work/dbcsr_work_operations.F) -set(DBCSR_OPENCL_SRCS - acc/opencl/acc_opencl.c acc/opencl/acc_opencl_event.c - acc/opencl/acc_opencl_mem.c acc/opencl/acc_opencl_stream.c) - -set(DBCSR_CUDA_SRCS - acc/cuda/acc_cublas.cu - acc/cuda/acc_cuda.cpp - acc/cuda/acc_dev.cpp - acc/cuda/acc_error.cpp - acc/cuda/acc_event.cpp - acc/cuda/acc_init.cpp - acc/cuda/acc_mem.cpp - acc/cuda/acc_stream.cpp - acc/cuda/dbcsr_cuda_nvtx_cu.cu) - -set(DBCSR_HIP_SRCS - acc/hip/acc_hip.cpp - acc/cuda/acc_cublas.cu +set(DBCSR_HIP_AND_CUDA_SRCS + acc/libsmm_acc/libsmm_acc_benchmark.cpp + acc/libsmm_acc/libsmm_acc_init.cpp + acc/libsmm_acc/libsmm_acc.cpp + acc/cuda/acc_cublas.cpp acc/cuda/acc_dev.cpp acc/cuda/acc_error.cpp acc/cuda/acc_event.cpp @@ -130,14 +117,34 @@ set(DBCSR_HIP_SRCS acc/cuda/acc_mem.cpp acc/cuda/acc_stream.cpp) +set(DBCSR_CUDA_SRCS ${DBCSR_HIP_AND_CUDA_SRCS} acc/cuda/acc_cuda.cpp + acc/cuda/dbcsr_cuda_nvtx_cu.cpp) + +set(DBCSR_HIP_SRCS ${DBCSR_HIP_AND_CUDA_SRCS} acc/hip/acc_hip.cpp) + +set(DBCSR_OPENCL_SRCS + acc/opencl/smm/opencl_libsmm.c acc/opencl/acc_opencl.c + acc/opencl/acc_opencl_event.c acc/opencl/acc_opencl_mem.c + acc/opencl/acc_opencl_stream.c) + # set the __SHORT_FILE__ per file for dbcsr sources -foreach (dbcsr_src ${DBCSR_SRCS}) +foreach (dbcsr_src ${DBCSR_FORTRAN_SRCS}) # add_fypp_sources returns a path in the current binary dir get_filename_component(short_file "${dbcsr_src}" NAME) set_source_files_properties( ${dbcsr_src} PROPERTIES COMPILE_DEFINITIONS __SHORT_FILE__="${short_file}") endforeach () +set(DBCSR_SRCS ${DBCSR_FORTRAN_SRCS}) + +if (USE_ACCEL MATCHES "cuda") + set(DBCSR_SRCS ${DBCSR_SRCS} ${DBCSR_CUDA_SRCS}) +elseif (USE_ACCEL MATCHES "hip") + set(DBCSR_SRCS ${DBCSR_SRCS} ${DBCSR_HIP_SRCS}) +elseif (USE_ACCEL MATCHES "opencl") + set(DBCSR_SRCS ${DBCSR_SRCS} ${DBCSR_OPENCL_SRCS}) +endif () + # ================================================================================================= # DBCSR LIBRARY add_library(dbcsr ${DBCSR_SRCS}) @@ -145,7 +152,7 @@ add_library(dbcsr ${DBCSR_SRCS}) set_target_properties(dbcsr PROPERTIES VERSION ${dbcsr_VERSION} SOVERSION ${dbcsr_APIVERSION}) -if (LIBXSMM_FOUND) +if (USE_SMM MATCHES "libxsmm") target_compile_definitions(dbcsr PRIVATE __LIBXSMM) target_link_libraries(dbcsr PRIVATE PkgConfig::LIBXSMM) endif () @@ -164,7 +171,7 @@ if (APPLE) endif () # set -DNDEBUG for Release builds -target_compile_definitions(dbcsr PRIVATE "$<$:NDEBUG>") +target_compile_definitions(dbcsr PRIVATE $<$:NDEBUG>) target_link_libraries(dbcsr PRIVATE ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) target_include_directories( @@ -196,163 +203,41 @@ if (MPI_FOUND) target_link_libraries(dbcsr PUBLIC MPI::MPI_Fortran) endif () -# ================================================================================================= -# Link OpenMP runtime library even if DBCSR main code is not built with OpenMP +target_link_libraries( + dbcsr + PRIVATE $<$:OpenMP::OpenMP_C> + $<$:OpenMP::OpenMP_CXX> + $<$:OpenMP::OpenMP_Fortran>) -if (OpenMP_FOUND) - target_link_libraries(dbcsr PRIVATE OpenMP::OpenMP_Fortran) +# todo, make this a bit better with opencl. +if (USE_ACCEL MATCHES "cuda|hip") + add_subdirectory(acc/libsmm_acc) endif () -# ================================================================================================= -# DBCSR LIBRARY's OPENCL BACKEND - if (USE_ACCEL MATCHES "opencl") - target_compile_definitions(dbcsr PRIVATE __DBCSR_ACC) - target_link_libraries(dbcsr PRIVATE ${OpenCL_LIBRARY}) - - # OpenCL backend - set(DBCSR_ACC_SRCS ${DBCSR_OPENCL_SRCS}) - add_library(acc OBJECT ${DBCSR_ACC_SRCS}) - target_compile_definitions(acc PRIVATE __OPENCL) - # account for DBCSR not calling libsmm_acc_init() (DBCSR only calls acc_init) - target_compile_definitions(acc PRIVATE __DBCSR_ACC) - target_include_directories(acc PRIVATE ${OpenCL_INCLUDE_DIRS}) - target_sources(dbcsr PRIVATE $) add_subdirectory(acc/opencl/smm) - target_sources(dbcsr PRIVATE $) endif () -# ================================================================================================= -# DBCSR LIBRARY's CUDA BACKEND - -# adapted from -# https://cliutils.gitlab.io/modern-cmake/chapters/packages/CUDA.html: replaces -# in the Fortran and CXX targets the interface compile options by a more -# differentiated version for Fortran, CXX and CUDA, where the CXX and CUDA -# options are taken from the CXX target -function (CUDA_CONVERT_FLAGS EXISTING_TARGET) - get_property( - old_fflags - TARGET ${EXISTING_TARGET}_Fortran - PROPERTY INTERFACE_COMPILE_OPTIONS) - get_property( - old_cxxflags - TARGET ${EXISTING_TARGET}_CXX - PROPERTY INTERFACE_COMPILE_OPTIONS) - - string(REPLACE ";" "," CUDA_flags "${old_cxxflags}") - if (NOT "${CUDA_flags}" STREQUAL "") - set(CUDA_flags "-Xcompiler=${CUDA_flags}") - endif () - - set_property( - TARGET ${EXISTING_TARGET}_Fortran - PROPERTY - INTERFACE_COMPILE_OPTIONS - "$<$>:${old_fflags}>$<$>:${old_cxxflags}>$<$>:${CUDA_flags}>" - ) - set_property( - TARGET ${EXISTING_TARGET}_CXX - PROPERTY - INTERFACE_COMPILE_OPTIONS - "$<$>:${old_fflags}>$<$>:${old_cxxflags}>$<$>:${CUDA_flags}>" - ) -endfunction () - -if (USE_ACCEL MATCHES "cuda") - if (${CMAKE_VERSION} VERSION_LESS 3.16) - # workaround for CUDA support with CMake <3.16, see also see - # https://gitlab.kitware.com/cmake/cmake/issues/17929 and - # https://cliutils.gitlab.io/modern-cmake/chapters/packages/CUDA.html - if (OpenMP_FOUND) - cuda_convert_flags(OpenMP::OpenMP) - endif () - if (MPI_FOUND) - cuda_convert_flags(MPI::MPI) - endif () - endif () - - target_link_libraries(dbcsr PUBLIC cuda) - - # add libsmm_acc which has its own CMakeLists.txt due to code generation - add_subdirectory(acc/libsmm_acc) - target_sources(dbcsr PRIVATE $) - target_link_libraries(dbcsr PRIVATE nvrtc) - - # Complete list of GPU-support sources - set(DBCSR_ACC_SRCS ${DBCSR_CUDA_SRCS}) - - # Make an object library - add_library(acc OBJECT ${DBCSR_ACC_SRCS}) - target_compile_definitions(acc PRIVATE __CUDA) - target_compile_definitions( - acc PRIVATE $<$:__CUDA_PROFILING>) - target_include_directories(acc - PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - target_sources(dbcsr PRIVATE $) - target_compile_definitions(acc PRIVATE __DBCSR_ACC) - target_link_libraries(acc PRIVATE ${CUBLAS} ${CUDA_NVTOOLSEXT}) - - # tests need some of the libsmm_acc headers and for CMake <3.12 compatibility, - # we can't set it on the the object library itself - target_include_directories( - dbcsr PUBLIC $) - - target_compile_definitions(dbcsr PRIVATE __DBCSR_ACC) - target_link_libraries(dbcsr PRIVATE ${CUBLAS} ${CUDA_NVTOOLSEXT}) - target_compile_definitions(dbcsr PRIVATE __CUDA) - # __CUDA_PROFILING also affects core/ parts like the timings +if (USE_ACCEL) target_compile_definitions( - dbcsr PRIVATE $<$:__CUDA_PROFILING>) - - if (APPLE) - # Help the static cuda runtime find the driver (libcuda.dyllib) at runtime. - set_property(TARGET dbcsr PROPERTY BUILD_RPATH - ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) - endif () -endif () - -# ================================================================================================= -# DBCSR LIBRARY's HIP BACKEND - -if (USE_ACCEL MATCHES "hip") - if (USE_OPENMP) - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${OpenMP_CXX_FLAGS}") - endif () - - # add libsmm_acc which has its own CMakeLists.txt due to code generation - add_subdirectory(acc/libsmm_acc) - target_link_libraries(dbcsr PUBLIC libsmm_acc) - target_link_libraries(dbcsr PUBLIC ${ROCM_HIPRTC_LIB}) - - # Complete list of GPU-support sources - set(DBCSR_ACC_SRCS ${DBCSR_HIP_SRCS}) - - # Compile the rest of the HIP source files into a static library - set_source_files_properties(${DBCSR_ACC_SRCS} - PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - hip_add_library(acc STATIC ${DBCSR_ACC_SRCS}) - target_include_directories(acc PRIVATE ${HIP_PATH}/../include) - target_compile_definitions(acc PRIVATE ARCH_NUMBER=${ACC_ARCH_NUMBER}) - target_compile_definitions(acc PRIVATE __HIP) - target_compile_options(acc PRIVATE "-fPIC") - target_link_libraries(acc PUBLIC ${HIPBLAS}) - target_link_libraries(dbcsr PUBLIC acc) - install( - TARGETS acc - EXPORT accTargets - LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" - ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}") - - # tests need some of the libsmm_acc headers and for CMake <3.12 compatibility, - # we can't set it on the the object library itself - target_include_directories( - dbcsr PUBLIC $) - target_include_directories( - dbcsr PUBLIC $) - - target_compile_definitions(dbcsr PRIVATE __DBCSR_ACC) - target_compile_definitions(dbcsr PRIVATE __HIP) + dbcsr + PRIVATE __DBCSR_ACC + $<$:__CUDA> + $<$:__OPENCL> + $<$:ARCH_NUMBER=${ACC_ARCH_NUMBER}> + $<$:__HIP> + $<$:ARCH_NUMBER=${ACC_ARCH_NUMBER}> + $<$:__CUDA_PROFILING>) + + target_link_libraries( + dbcsr + PRIVATE $<$:CUDA::cudart> + $<$:CUDA::cublas> + $<$:CUDA::nvToolsExt> + $<$:CUDA::nvrtc> + $<$:roc::hipblas> + $<$:hip::host> + $<$:OpenCL::OpenCL>) endif () # ================================================================================================= @@ -418,16 +303,6 @@ write_basic_package_version_file( "${CMAKE_CURRENT_BINARY_DIR}/DBCSRConfigVersion.cmake" VERSION "${dbcsr_VERSION}" COMPATIBILITY SameMajorVersion) -if (USE_ACCEL MATCHES "hip") - install( - EXPORT libsmm_accTargets - NAMESPACE "${config_namespace}" - DESTINATION "${config_install_dir}") - install( - EXPORT accTargets - NAMESPACE "${config_namespace}" - DESTINATION "${config_install_dir}") -endif () install( EXPORT DBCSRTargets NAMESPACE "${config_namespace}" diff --git a/src/acc/acc.h b/src/acc/acc.h index f60a5a17df7..bdfdf2eee3f 100644 --- a/src/acc/acc.h +++ b/src/acc/acc.h @@ -25,41 +25,41 @@ extern "C" { typedef int acc_bool_t; /** initialization and finalization */ -int acc_init(void); -int acc_finalize(void); -void acc_clear_errors(void); +int c_dbcsr_acc_init(void); +int c_dbcsr_acc_finalize(void); +void c_dbcsr_acc_clear_errors(void); /** devices */ -int acc_get_ndevices(int* ndevices); -int acc_set_active_device(int device_id); +int c_dbcsr_acc_get_ndevices(int* ndevices); +int c_dbcsr_acc_set_active_device(int device_id); /** streams */ -int acc_stream_priority_range(int* least, int* greatest); -int acc_stream_create(void** stream_p, const char* name, +int c_dbcsr_acc_stream_priority_range(int* least, int* greatest); +int c_dbcsr_acc_stream_create(void** stream_p, const char* name, /** lower number is higher priority */ int priority); -int acc_stream_destroy(void* stream); -int acc_stream_sync(void* stream); -int acc_stream_wait_event(void* stream, void* event); +int c_dbcsr_acc_stream_destroy(void* stream); +int c_dbcsr_acc_stream_sync(void* stream); +int c_dbcsr_acc_stream_wait_event(void* stream, void* event); /** events */ -int acc_event_create(void** event_p); -int acc_event_destroy(void* event); -int acc_event_record(void* event, void* stream); -int acc_event_query(void* event, acc_bool_t* has_occurred); -int acc_event_synchronize(void* event); +int c_dbcsr_acc_event_create(void** event_p); +int c_dbcsr_acc_event_destroy(void* event); +int c_dbcsr_acc_event_record(void* event, void* stream); +int c_dbcsr_acc_event_query(void* event, acc_bool_t* has_occurred); +int c_dbcsr_acc_event_synchronize(void* event); /** memory */ -int acc_dev_mem_allocate(void** dev_mem, size_t nbytes); -int acc_dev_mem_deallocate(void* dev_mem); -int acc_dev_mem_set_ptr(void** dev_mem, void* other, size_t lb); -int acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream); -int acc_host_mem_deallocate(void* host_mem, void* stream); -int acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, void* stream); -int acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, void* stream); -int acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbytes, void* stream); -int acc_memset_zero(void* dev_mem, size_t offset, size_t nbytes, void* stream); -int acc_dev_mem_info(size_t* mem_free, size_t* mem_total); +int c_dbcsr_acc_dev_mem_allocate(void** dev_mem, size_t nbytes); +int c_dbcsr_acc_dev_mem_deallocate(void* dev_mem); +int c_dbcsr_acc_dev_mem_set_ptr(void** dev_mem, void* other, size_t lb); +int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream); +int c_dbcsr_acc_host_mem_deallocate(void* host_mem, void* stream); +int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, void* stream); +int c_dbcsr_acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, void* stream); +int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbytes, void* stream); +int c_dbcsr_acc_memset_zero(void* dev_mem, size_t offset, size_t nbytes, void* stream); +int c_dbcsr_acc_dev_mem_info(size_t* mem_free, size_t* mem_total); #if defined(__cplusplus) } diff --git a/src/acc/acc_bench_smm.c b/src/acc/acc_bench_smm.c index 25a750ae845..560f6b59e5e 100644 --- a/src/acc/acc_bench_smm.c +++ b/src/acc/acc_bench_smm.c @@ -105,10 +105,8 @@ int main(int argc, char* argv[]) assert(m <= (mn / n) && 0 == (mn % n) && k <= (mk / k) && 0 == (mk % k) && n <= (kn / n) && 0 == (kn % n)); printf("%s%s%i %i %i %i %i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n, k, nc, na, nb); - CHECK(acc_init(), &result); - /* note: libsmm_acc_init() may imply acc_init() */ - CHECK(libsmm_acc_init(), &result); - CHECK(acc_get_ndevices(&ndevices), &result); + CHECK(c_dbcsr_acc_init(), &result); + CHECK(c_dbcsr_acc_get_ndevices(&ndevices), &result); if (0 < ndevices) { #if defined(_DEBUG) fprintf(stderr, "number of devices found: %i\n", ndevices); @@ -118,20 +116,17 @@ int main(int argc, char* argv[]) #if defined(_DEBUG) fprintf(stderr, "Error: no device found!\n"); #endif -#if !defined(__CUDA) - CHECK(libsmm_acc_finalize(), NULL); -#endif - CHECK(acc_finalize(), NULL); + CHECK(c_dbcsr_acc_finalize(), NULL); return result; } printf("typename (id=%i): %s\n", DBCSR_TYPE(ELEM_TYPE), DBCSR_STRINGIFY(ELEM_TYPE)); - CHECK(acc_stream_create(&stream, "stream", -1/*default priority*/), &result); - CHECK(acc_host_mem_allocate((void**)&amat_hst, sizeof(ELEM_TYPE) * mk * na, stream), &result); - CHECK(acc_host_mem_allocate((void**)&bmat_hst, sizeof(ELEM_TYPE) * kn * nb, stream), &result); - CHECK(acc_host_mem_allocate((void**)&cmat_hst, sizeof(ELEM_TYPE) * mn * nc, stream), &result); - CHECK(acc_host_mem_allocate((void**)&stack_hst, sizeof(int) * 3 * stack_size, stream), &result); - CHECK(acc_host_mem_allocate((void**)&trans_hst, sizeof(int) * nb, stream), &result); - CHECK(acc_stream_sync(stream), &result); /* ensure host-data is allocated */ + CHECK(c_dbcsr_acc_stream_create(&stream, "stream", -1/*default priority*/), &result); + CHECK(c_dbcsr_acc_host_mem_allocate((void**)&amat_hst, sizeof(ELEM_TYPE) * mk * na, stream), &result); + CHECK(c_dbcsr_acc_host_mem_allocate((void**)&bmat_hst, sizeof(ELEM_TYPE) * kn * nb, stream), &result); + CHECK(c_dbcsr_acc_host_mem_allocate((void**)&cmat_hst, sizeof(ELEM_TYPE) * mn * nc, stream), &result); + CHECK(c_dbcsr_acc_host_mem_allocate((void**)&stack_hst, sizeof(int) * 3 * stack_size, stream), &result); + CHECK(c_dbcsr_acc_host_mem_allocate((void**)&trans_hst, sizeof(int) * nb, stream), &result); + CHECK(c_dbcsr_acc_stream_sync(stream), &result); /* ensure host-data is allocated */ /* initialize matrices */ for (i = 0; i < na; ++i) { init(i/*seed*/ + 42, &amat_hst[i*mk], m, k, 1.0 / (nc * na)); @@ -141,22 +136,22 @@ int main(int argc, char* argv[]) trans_hst[i] = i * kn; } init_stack(stack_hst, stack_size, mn, mk, kn, nc, na, nb); - CHECK(acc_dev_mem_allocate((void**)&amat_dev, sizeof(ELEM_TYPE) * mk * na), &result); - CHECK(acc_dev_mem_allocate((void**)&bmat_dev, sizeof(ELEM_TYPE) * kn * nb), &result); - CHECK(acc_dev_mem_allocate((void**)&cmat_dev, sizeof(ELEM_TYPE) * mn * nc), &result); - CHECK(acc_dev_mem_allocate((void**)&stack_dev, sizeof(int) * 3 * stack_size), &result); - CHECK(acc_dev_mem_allocate((void**)&trans_dev, sizeof(int) * nb), &result); - CHECK(acc_memset_zero(cmat_dev, 0/*offset*/, sizeof(ELEM_TYPE) * mn * nc, stream), &result); - CHECK(acc_memcpy_h2d(trans_hst, trans_dev, sizeof(int) * nb, stream), &result); + CHECK(c_dbcsr_acc_dev_mem_allocate((void**)&amat_dev, sizeof(ELEM_TYPE) * mk * na), &result); + CHECK(c_dbcsr_acc_dev_mem_allocate((void**)&bmat_dev, sizeof(ELEM_TYPE) * kn * nb), &result); + CHECK(c_dbcsr_acc_dev_mem_allocate((void**)&cmat_dev, sizeof(ELEM_TYPE) * mn * nc), &result); + CHECK(c_dbcsr_acc_dev_mem_allocate((void**)&stack_dev, sizeof(int) * 3 * stack_size), &result); + CHECK(c_dbcsr_acc_dev_mem_allocate((void**)&trans_dev, sizeof(int) * nb), &result); + CHECK(c_dbcsr_acc_memset_zero(cmat_dev, 0/*offset*/, sizeof(ELEM_TYPE) * mn * nc, stream), &result); + CHECK(c_dbcsr_acc_memcpy_h2d(trans_hst, trans_dev, sizeof(int) * nb, stream), &result); #if defined(USE_LIBXSMM) - CHECK(acc_stream_sync(stream), &result); + CHECK(c_dbcsr_acc_stream_sync(stream), &result); start = libxsmm_timer_tick(); #endif - CHECK(acc_memcpy_h2d(amat_hst, amat_dev, sizeof(ELEM_TYPE) * mk * na, stream), &result); - CHECK(acc_memcpy_h2d(bmat_hst, bmat_dev, sizeof(ELEM_TYPE) * kn * nb, stream), &result); - CHECK(acc_memcpy_h2d(stack_hst, stack_dev, sizeof(int) * 3 * stack_size, stream), &result); + CHECK(c_dbcsr_acc_memcpy_h2d(amat_hst, amat_dev, sizeof(ELEM_TYPE) * mk * na, stream), &result); + CHECK(c_dbcsr_acc_memcpy_h2d(bmat_hst, bmat_dev, sizeof(ELEM_TYPE) * kn * nb, stream), &result); + CHECK(c_dbcsr_acc_memcpy_h2d(stack_hst, stack_dev, sizeof(int) * 3 * stack_size, stream), &result); #if defined(USE_LIBXSMM) - CHECK(acc_stream_sync(stream), &result); + CHECK(c_dbcsr_acc_stream_sync(stream), &result); duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); printf("copy-in: %.1f ms %.1f GB/s\n", 1000.0 * duration, (sizeof(ELEM_TYPE) * (mk + kn) + sizeof(int) * 3) @@ -170,15 +165,15 @@ int main(int argc, char* argv[]) CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev, DBCSR_TYPE(ELEM_TYPE), n, k, MAX_KERNEL_DIM, stream), &result); } -# if defined(USE_LIBXSMM) - CHECK(acc_stream_sync(stream), &result); +#if defined(USE_LIBXSMM) + CHECK(c_dbcsr_acc_stream_sync(stream), &result); start = libxsmm_timer_tick(); -# endif +#endif /* to perform NN-SMMs on the device, all B-matrices are transposed upfront (SMM-kernel is limited to NT) */ CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev, DBCSR_TYPE(ELEM_TYPE), k, n, MAX_KERNEL_DIM, stream), &result); -# if defined(USE_LIBXSMM) - CHECK(acc_stream_sync(stream), &result); +#if defined(USE_LIBXSMM) + CHECK(c_dbcsr_acc_stream_sync(stream), &result); transpose = libxsmm_timer_duration(start, libxsmm_timer_tick()); # endif #endif @@ -187,9 +182,9 @@ int main(int argc, char* argv[]) CHECK(libsmm_acc_process(stack_hst, stack_dev, stack_size, 3/*nparams*/, DBCSR_TYPE(ELEM_TYPE), amat_dev, bmat_dev, cmat_dev, m, n, k, MAX_KERNEL_DIM, 1/*homogeneous*/, stream, stream), &result); } - CHECK(acc_memset_zero(cmat_dev, 0/*offset*/, sizeof(ELEM_TYPE) * mn * nc, stream), &result); + CHECK(c_dbcsr_acc_memset_zero(cmat_dev, 0/*offset*/, sizeof(ELEM_TYPE) * mn * nc, stream), &result); #if defined(USE_LIBXSMM) - CHECK(acc_stream_sync(stream), &result); + CHECK(c_dbcsr_acc_stream_sync(stream), &result); start = libxsmm_timer_tick(); #endif for (r = 0; r < nrepeat; ++r) { @@ -198,7 +193,7 @@ int main(int argc, char* argv[]) amat_dev, bmat_dev, cmat_dev, m, n, k, MAX_KERNEL_DIM, 1/*homogeneous*/, stream, stream), &result); } #if defined(USE_LIBXSMM) - CHECK(acc_stream_sync(stream), &result); + CHECK(c_dbcsr_acc_stream_sync(stream), &result); duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); # if defined(VALIDATE) && (0 != VALIDATE) if (0 != check && EXIT_SUCCESS == result) { @@ -234,8 +229,8 @@ int main(int argc, char* argv[]) printf("host: %.1f ms %.1f GFLOPS/s\n", 1000.0 * duration / nrepeat, ((size_t)2 * m * n * k) * stack_size / (duration * (1ULL << 30) / nrepeat)); /* transfer result from device to host for validation */ - CHECK(acc_memcpy_d2h(cmat_dev, cmat_hst, sizeof(ELEM_TYPE) * mn * nc, stream), &result); - CHECK(acc_stream_sync(stream), &result); + CHECK(c_dbcsr_acc_memcpy_d2h(cmat_dev, cmat_hst, sizeof(ELEM_TYPE) * mn * nc, stream), &result); + CHECK(c_dbcsr_acc_stream_sync(stream), &result); if (EXIT_SUCCESS == result) { double abserror = 0, relerror = 0; for (i = 0; i < nc; ++i) { @@ -276,21 +271,18 @@ int main(int argc, char* argv[]) ((size_t)2 * m * n * k) * stack_size / (duration * (1ULL << 30) / nrepeat)); } #endif - CHECK(acc_host_mem_deallocate(stack_hst, stream), NULL); - CHECK(acc_host_mem_deallocate(trans_hst, stream), NULL); - CHECK(acc_host_mem_deallocate(amat_hst, stream), NULL); - CHECK(acc_host_mem_deallocate(bmat_hst, stream), NULL); - CHECK(acc_host_mem_deallocate(cmat_hst, stream), NULL); - CHECK(acc_dev_mem_deallocate(stack_dev), NULL); - CHECK(acc_dev_mem_deallocate(trans_dev), NULL); - CHECK(acc_dev_mem_deallocate(amat_dev), NULL); - CHECK(acc_dev_mem_deallocate(bmat_dev), NULL); - CHECK(acc_dev_mem_deallocate(cmat_dev), NULL); - CHECK(acc_stream_destroy(stream), NULL); -#if !defined(__CUDA) - CHECK(libsmm_acc_finalize(), NULL); -#endif - CHECK(acc_finalize(), NULL); + CHECK(c_dbcsr_acc_host_mem_deallocate(stack_hst, stream), NULL); + CHECK(c_dbcsr_acc_host_mem_deallocate(trans_hst, stream), NULL); + CHECK(c_dbcsr_acc_host_mem_deallocate(amat_hst, stream), NULL); + CHECK(c_dbcsr_acc_host_mem_deallocate(bmat_hst, stream), NULL); + CHECK(c_dbcsr_acc_host_mem_deallocate(cmat_hst, stream), NULL); + CHECK(c_dbcsr_acc_dev_mem_deallocate(stack_dev), NULL); + CHECK(c_dbcsr_acc_dev_mem_deallocate(trans_dev), NULL); + CHECK(c_dbcsr_acc_dev_mem_deallocate(amat_dev), NULL); + CHECK(c_dbcsr_acc_dev_mem_deallocate(bmat_dev), NULL); + CHECK(c_dbcsr_acc_dev_mem_deallocate(cmat_dev), NULL); + CHECK(c_dbcsr_acc_stream_destroy(stream), NULL); + CHECK(c_dbcsr_acc_finalize(), NULL); if (EXIT_SUCCESS != result) { fprintf(stderr, "FAILED\n"); } diff --git a/src/acc/acc_bench_trans.c b/src/acc/acc_bench_trans.c index ccf4288ba82..b10bb0b3d82 100644 --- a/src/acc/acc_bench_trans.c +++ b/src/acc/acc_bench_trans.c @@ -90,10 +90,8 @@ int main(int argc, char* argv[]) #endif assert(m <= (mn / n) && 0 == (mn % n)); printf("%s%s%i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n); - CHECK(acc_init(), &result); - /* note: libsmm_acc_init() may imply acc_init() */ - CHECK(libsmm_acc_init(), &result); - CHECK(acc_get_ndevices(&ndevices), &result); + CHECK(c_dbcsr_acc_init(), &result); + CHECK(c_dbcsr_acc_get_ndevices(&ndevices), &result); if (0 < ndevices) { #if defined(_DEBUG) fprintf(stderr, "number of devices found: %i\n", ndevices); @@ -103,22 +101,19 @@ int main(int argc, char* argv[]) #if defined(_DEBUG) fprintf(stderr, "Error: no device found!\n"); #endif -#if !defined(__CUDA) - CHECK(libsmm_acc_finalize(), NULL); -#endif - CHECK(acc_finalize(), NULL); + CHECK(c_dbcsr_acc_finalize(), NULL); return result; } printf("typename (id=%i): %s\n", DBCSR_TYPE(ELEM_TYPE), DBCSR_STRINGIFY(ELEM_TYPE)); #if defined(PRIORITY) - CHECK(acc_stream_priority_range(&priomin, &priomax), &result); - CHECK(acc_stream_create(&stream, "stream", (priomin + priomax) / 2), &result); + CHECK(c_dbcsr_acc_stream_priority_range(&priomin, &priomax), &result); + CHECK(c_dbcsr_acc_stream_create(&stream, "stream", (priomin + priomax) / 2), &result); #else - CHECK(acc_stream_create(&stream, "stream", -1/*default priority*/), &result); + CHECK(c_dbcsr_acc_stream_create(&stream, "stream", -1/*default priority*/), &result); #endif - CHECK(acc_host_mem_allocate((void**)&mat_hst, sizeof(ELEM_TYPE) * mn * offset_stack_size, stream), &result); - CHECK(acc_host_mem_allocate((void**)&stack_hst, sizeof(int) * offset_stack_size, stream), &result); - CHECK(acc_stream_sync(stream), &result); /* ensure host-data is allocated */ + CHECK(c_dbcsr_acc_host_mem_allocate((void**)&mat_hst, sizeof(ELEM_TYPE) * mn * offset_stack_size, stream), &result); + CHECK(c_dbcsr_acc_host_mem_allocate((void**)&stack_hst, sizeof(int) * offset_stack_size, stream), &result); + CHECK(c_dbcsr_acc_stream_sync(stream), &result); /* ensure host-data is allocated */ for (i = 0; i < offset_stack_size; ++i) { /* initialize matrices */ init(i/*seed*/, &mat_hst[i*mn], m, n); } @@ -130,16 +125,16 @@ int main(int argc, char* argv[]) #endif stack_hst[i] = j; } - CHECK(acc_dev_mem_allocate((void**)&mat_dev, sizeof(ELEM_TYPE) * mn * offset_stack_size), &result); - CHECK(acc_dev_mem_allocate((void**)&stack_dev, sizeof(int) * offset_stack_size), &result); + CHECK(c_dbcsr_acc_dev_mem_allocate((void**)&mat_dev, sizeof(ELEM_TYPE) * mn * offset_stack_size), &result); + CHECK(c_dbcsr_acc_dev_mem_allocate((void**)&stack_dev, sizeof(int) * offset_stack_size), &result); #if defined(USE_LIBXSMM) - CHECK(acc_stream_sync(stream), &result); + CHECK(c_dbcsr_acc_stream_sync(stream), &result); start = libxsmm_timer_tick(); #endif - CHECK(acc_memcpy_h2d(mat_hst, mat_dev, sizeof(ELEM_TYPE) * mn * offset_stack_size, stream), &result); - CHECK(acc_memcpy_h2d(stack_hst, stack_dev, sizeof(int) * offset_stack_size, stream), &result); + CHECK(c_dbcsr_acc_memcpy_h2d(mat_hst, mat_dev, sizeof(ELEM_TYPE) * mn * offset_stack_size, stream), &result); + CHECK(c_dbcsr_acc_memcpy_h2d(stack_hst, stack_dev, sizeof(int) * offset_stack_size, stream), &result); #if defined(USE_LIBXSMM) - CHECK(acc_stream_sync(stream), &result); + CHECK(c_dbcsr_acc_stream_sync(stream), &result); duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); printf("copy-in: %.1f ms %.1f GB/s\n", 1000.0 * duration, (sizeof(ELEM_TYPE) * mn + sizeof(int)) @@ -153,7 +148,7 @@ int main(int argc, char* argv[]) DBCSR_TYPE(ELEM_TYPE), n, m, MAX_KERNEL_DIM, stream), &result); } #if defined(USE_LIBXSMM) - CHECK(acc_stream_sync(stream), &result); + CHECK(c_dbcsr_acc_stream_sync(stream), &result); start = libxsmm_timer_tick(); #endif for (r = 0; r < nodd; ++r) { @@ -162,7 +157,7 @@ int main(int argc, char* argv[]) swap(&mm, &nn); } #if defined(USE_LIBXSMM) - CHECK(acc_stream_sync(stream), &result); + CHECK(c_dbcsr_acc_stream_sync(stream), &result); duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (EXIT_SUCCESS == result) { assert(0 < nodd && (nodd & 1/*odd*/)); @@ -181,9 +176,9 @@ int main(int argc, char* argv[]) (sizeof(ELEM_TYPE) * mn + sizeof(int)) * offset_stack_size / (duration * (1ULL << 30) / nodd)); /* transfer result from device to host for validation */ - CHECK(acc_memcpy_d2h(mat_dev, mat_hst, + CHECK(c_dbcsr_acc_memcpy_d2h(mat_dev, mat_hst, sizeof(ELEM_TYPE) * mn * offset_stack_size, stream), &result); - CHECK(acc_stream_sync(stream), &result); + CHECK(c_dbcsr_acc_stream_sync(stream), &result); if (EXIT_SUCCESS == result) { unsigned int nerrors = 0; for (i = offset; i < offset_stack_size; ++i) { @@ -210,15 +205,12 @@ int main(int argc, char* argv[]) } } #endif - CHECK(acc_host_mem_deallocate(stack_hst, stream), NULL); - CHECK(acc_host_mem_deallocate(mat_hst, stream), NULL); - CHECK(acc_dev_mem_deallocate(stack_dev), NULL); - CHECK(acc_dev_mem_deallocate(mat_dev), NULL); - CHECK(acc_stream_destroy(stream), NULL); -#if !defined(__CUDA) - CHECK(libsmm_acc_finalize(), NULL); -#endif - CHECK(acc_finalize(), NULL); + CHECK(c_dbcsr_acc_host_mem_deallocate(stack_hst, stream), NULL); + CHECK(c_dbcsr_acc_host_mem_deallocate(mat_hst, stream), NULL); + CHECK(c_dbcsr_acc_dev_mem_deallocate(stack_dev), NULL); + CHECK(c_dbcsr_acc_dev_mem_deallocate(mat_dev), NULL); + CHECK(c_dbcsr_acc_stream_destroy(stream), NULL); + CHECK(c_dbcsr_acc_finalize(), NULL); if (EXIT_SUCCESS != result) { fprintf(stderr, "FAILED\n"); } diff --git a/src/acc/cuda/acc_cublas.cu b/src/acc/cuda/acc_cublas.cpp similarity index 100% rename from src/acc/cuda/acc_cublas.cu rename to src/acc/cuda/acc_cublas.cpp diff --git a/src/acc/cuda/acc_dev.cpp b/src/acc/cuda/acc_dev.cpp index b795fe319c4..2381b6c90a9 100644 --- a/src/acc/cuda/acc_dev.cpp +++ b/src/acc/cuda/acc_dev.cpp @@ -23,14 +23,14 @@ static const int verbose_print = 1; /****************************************************************************/ -extern "C" int acc_get_ndevices(int *n_devices){ +extern "C" int c_dbcsr_acc_get_ndevices(int *n_devices){ ACC_API_CALL(GetDeviceCount, (n_devices)); return 0; } /****************************************************************************/ -extern "C" int acc_set_active_device(int device_id){ +extern "C" int c_dbcsr_acc_set_active_device(int device_id){ int myDevice, runtimeVersion; ACC_API_CALL(RuntimeGetVersion, (&runtimeVersion)); diff --git a/src/acc/cuda/acc_error.cpp b/src/acc/cuda/acc_error.cpp index 4c9804c7c99..4e2bccd11fd 100644 --- a/src/acc/cuda/acc_error.cpp +++ b/src/acc/cuda/acc_error.cpp @@ -27,6 +27,6 @@ int acc_error_check (ACC(Error_t) error){ return 0; } -extern "C" void acc_clear_errors () { +extern "C" void c_dbcsr_acc_clear_errors () { ACC(GetLastError)(); } diff --git a/src/acc/cuda/acc_event.cpp b/src/acc/cuda/acc_event.cpp index 0a8745b685b..fd86c324fcd 100644 --- a/src/acc/cuda/acc_event.cpp +++ b/src/acc/cuda/acc_event.cpp @@ -22,7 +22,7 @@ static const int verbose_print = 0; /****************************************************************************/ -extern "C" int acc_event_create(void** event_p){ +extern "C" int c_dbcsr_acc_event_create(void** event_p){ *event_p = malloc(sizeof(ACC(Event_t))); ACC(Event_t)* acc_event = (ACC(Event_t)*) *event_p; @@ -35,7 +35,7 @@ extern "C" int acc_event_create(void** event_p){ /****************************************************************************/ -extern "C" int acc_event_destroy(void* event){ +extern "C" int c_dbcsr_acc_event_destroy(void* event){ ACC(Event_t)* acc_event = (ACC(Event_t*)) event; if(verbose_print) printf("EventDestroy, called\n"); @@ -49,7 +49,7 @@ extern "C" int acc_event_destroy(void* event){ /****************************************************************************/ -extern "C" int acc_event_record(void* event, void* stream){ +extern "C" int c_dbcsr_acc_event_record(void* event, void* stream){ ACC(Event_t)* acc_event = (ACC(Event_t)*) event; ACC(Stream_t)* acc_stream = (ACC(Stream_t)*) stream; @@ -60,8 +60,8 @@ extern "C" int acc_event_record(void* event, void* stream){ /****************************************************************************/ -extern "C" int acc_event_query(void* event, int* has_occurred){ - if(verbose_print) printf("acc_event_query called\n"); +extern "C" int c_dbcsr_acc_event_query(void* event, int* has_occurred){ + if(verbose_print) printf("dbcsr_acc_event_query called\n"); ACC(Event_t)* acc_event = (ACC(Event_t)*) event; ACC(Error_t) cErr = ACC(EventQuery)(*acc_event); @@ -80,8 +80,8 @@ extern "C" int acc_event_query(void* event, int* has_occurred){ /****************************************************************************/ -extern "C" int acc_stream_wait_event(void* stream, void* event){ - if(verbose_print) printf("acc_stream_wait_event called\n"); +extern "C" int c_dbcsr_acc_stream_wait_event(void* stream, void* event){ + if(verbose_print) printf("c_dbcsr_acc_stream_wait_event called\n"); ACC(Event_t)* acc_event = (ACC(Event_t)*) event; ACC(Stream_t)* acc_stream = (ACC(Stream_t)*) stream; @@ -93,7 +93,7 @@ extern "C" int acc_stream_wait_event(void* stream, void* event){ /****************************************************************************/ -extern "C" int acc_event_synchronize(void* event){ +extern "C" int c_dbcsr_acc_event_synchronize(void* event){ if(verbose_print) printf("EventSynchronize called\n"); ACC(Event_t)* acc_event = (ACC(Event_t)*) event; ACC_API_CALL(EventSynchronize, (*acc_event)); diff --git a/src/acc/cuda/acc_init.cpp b/src/acc/cuda/acc_init.cpp index d3a114aa8c3..8fc7f21c73e 100644 --- a/src/acc/cuda/acc_init.cpp +++ b/src/acc/cuda/acc_init.cpp @@ -23,7 +23,7 @@ #endif /****************************************************************************/ -extern "C" int acc_init(){ +extern "C" int c_dbcsr_acc_init(){ int myDevice; // Driver boilerplate ACC_DRV_CALL(Init, (0)); @@ -38,7 +38,7 @@ extern "C" int acc_init(){ } /****************************************************************************/ -extern "C" int acc_finalize(){ +extern "C" int c_dbcsr_acc_finalize(){ int myDevice; // Release driver resources ACC_DRV(device) acc_device; diff --git a/src/acc/cuda/acc_mem.cpp b/src/acc/cuda/acc_mem.cpp index 52168e020a9..9fa30c535f7 100644 --- a/src/acc/cuda/acc_mem.cpp +++ b/src/acc/cuda/acc_mem.cpp @@ -21,9 +21,12 @@ static const int verbose_print = 0; +// some api calls have changed, but we wrap them internally +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" /****************************************************************************/ -extern "C" int acc_dev_mem_allocate(void **dev_mem, size_t n){ +extern "C" int c_dbcsr_acc_dev_mem_allocate(void **dev_mem, size_t n){ ACC_API_CALL(Malloc, ((void **) dev_mem, (size_t) n)); if (dev_mem == NULL) return -2; @@ -35,7 +38,7 @@ extern "C" int acc_dev_mem_allocate(void **dev_mem, size_t n){ /****************************************************************************/ -extern "C" int acc_dev_mem_deallocate(void *dev_mem){ +extern "C" int c_dbcsr_acc_dev_mem_deallocate(void *dev_mem){ if (verbose_print) printf ("Device deallocation address %p\n", dev_mem); ACC_API_CALL(Free, ((void *) dev_mem)); @@ -45,7 +48,7 @@ extern "C" int acc_dev_mem_deallocate(void *dev_mem){ /****************************************************************************/ -extern "C" int acc_host_mem_allocate(void **host_mem, size_t n, void *stream){ +extern "C" int c_dbcsr_acc_host_mem_allocate(void **host_mem, size_t n, void *stream){ unsigned int flag = ACC(HostAllocDefault); ACC_API_CALL(HostAlloc, ((void **) host_mem, (size_t) n, flag)); @@ -59,7 +62,7 @@ extern "C" int acc_host_mem_allocate(void **host_mem, size_t n, void *stream){ /****************************************************************************/ -extern "C" int acc_host_mem_deallocate(void *host_mem, void *stream){ +extern "C" int c_dbcsr_acc_host_mem_deallocate(void *host_mem, void *stream){ if (verbose_print) printf ("Host pinned deallocation address %p\n", host_mem); ACC_API_CALL(FreeHost, ((void *) host_mem)); @@ -68,7 +71,7 @@ extern "C" int acc_host_mem_deallocate(void *host_mem, void *stream){ } /****************************************************************************/ -extern "C" int acc_dev_mem_set_ptr(void **dev_mem, void *other, size_t lb){ +extern "C" int c_dbcsr_acc_dev_mem_set_ptr(void **dev_mem, void *other, size_t lb){ (*dev_mem) = ((char *) other) + lb; @@ -76,7 +79,7 @@ extern "C" int acc_dev_mem_set_ptr(void **dev_mem, void *other, size_t lb){ } /****************************************************************************/ -extern "C" int acc_memcpy_h2d(const void *host_mem, void *dev_mem, size_t count, void* stream){ +extern "C" int c_dbcsr_acc_memcpy_h2d(const void *host_mem, void *dev_mem, size_t count, void* stream){ ACC(Stream_t)* acc_stream = (ACC(Stream_t)*) stream; if (verbose_print) printf ("Copying %zd bytes from host address %p to device address %p \n", count, host_mem, dev_mem); @@ -88,7 +91,7 @@ extern "C" int acc_memcpy_h2d(const void *host_mem, void *dev_mem, size_t count, /****************************************************************************/ -extern "C" int acc_memcpy_d2h(const void *dev_mem, void *host_mem, size_t count, void* stream){ +extern "C" int c_dbcsr_acc_memcpy_d2h(const void *dev_mem, void *host_mem, size_t count, void* stream){ ACC(Stream_t)* acc_stream = (ACC(Stream_t)*) stream; if (verbose_print) printf ("Copying %zd bytes from device address %p to host address %p\n", count, dev_mem, host_mem); @@ -103,7 +106,7 @@ extern "C" int acc_memcpy_d2h(const void *dev_mem, void *host_mem, size_t count, /****************************************************************************/ -extern "C" int acc_memcpy_d2d(const void *devmem_src, void *devmem_dst, size_t count, void* stream){ +extern "C" int c_dbcsr_acc_memcpy_d2d(const void *devmem_src, void *devmem_dst, size_t count, void* stream){ ACC(Stream_t)* acc_stream = (ACC(Stream_t)*) stream; if (verbose_print) printf ("Copying %zd bytes from device address %p to device address %p \n", count, devmem_src, devmem_dst); @@ -120,7 +123,7 @@ extern "C" int acc_memcpy_d2d(const void *devmem_src, void *devmem_dst, size_t c /****************************************************************************/ -extern "C" int acc_memset_zero(void *dev_mem, size_t offset, size_t length, void* stream){ +extern "C" int c_dbcsr_acc_memset_zero(void *dev_mem, size_t offset, size_t length, void* stream){ ACC(Error_t) cErr; ACC(Stream_t)* acc_stream = (ACC(Stream_t)*) stream; if(stream == NULL){ @@ -142,7 +145,9 @@ extern "C" int acc_memset_zero(void *dev_mem, size_t offset, size_t length, void /****************************************************************************/ -extern "C" int acc_dev_mem_info(size_t* free, size_t* avail){ +extern "C" int c_dbcsr_acc_dev_mem_info(size_t* free, size_t* avail){ ACC_API_CALL(MemGetInfo, (free, avail)); return 0; } + +#pragma GCC diagnostic pop \ No newline at end of file diff --git a/src/acc/cuda/acc_stream.cpp b/src/acc/cuda/acc_stream.cpp index 4379d234b55..0da98999d49 100644 --- a/src/acc/cuda/acc_stream.cpp +++ b/src/acc/cuda/acc_stream.cpp @@ -27,7 +27,7 @@ static const int verbose_print = 0; /****************************************************************************/ -extern "C" int acc_stream_priority_range(int* least, int* greatest){ +extern "C" int c_dbcsr_acc_stream_priority_range(int* least, int* greatest){ *least = -1; *greatest = -1; ACC_API_CALL(DeviceGetStreamPriorityRange, (least, greatest)); @@ -37,7 +37,7 @@ extern "C" int acc_stream_priority_range(int* least, int* greatest){ /****************************************************************************/ -extern "C" int acc_stream_create(void** stream_p, const char* name, int priority){ +extern "C" int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority){ ACC(Error_t) cErr; *stream_p = malloc(sizeof(ACC(Stream_t))); @@ -63,7 +63,7 @@ extern "C" int acc_stream_create(void** stream_p, const char* name, int priority /****************************************************************************/ -extern "C" int acc_stream_destroy(void* stream){ +extern "C" int c_dbcsr_acc_stream_destroy(void* stream){ ACC(Stream_t)* acc_stream = (ACC(Stream_t)*) stream; if(verbose_print) printf("StreamDestroy called\n"); @@ -76,7 +76,7 @@ extern "C" int acc_stream_destroy(void* stream){ } /****************************************************************************/ -extern "C" int acc_stream_sync(void* stream) +extern "C" int c_dbcsr_acc_stream_sync(void* stream) { ACC(Stream_t)* acc_stream = (ACC(Stream_t)*) stream; ACC_API_CALL(StreamSynchronize, (*acc_stream)); diff --git a/src/acc/cuda/dbcsr_cuda_nvtx_cu.cu b/src/acc/cuda/dbcsr_cuda_nvtx_cu.cpp similarity index 100% rename from src/acc/cuda/dbcsr_cuda_nvtx_cu.cu rename to src/acc/cuda/dbcsr_cuda_nvtx_cu.cpp diff --git a/src/acc/dbcsr_acc_device.F b/src/acc/dbcsr_acc_device.F index 36a5ac0811d..cf50e574c11 100644 --- a/src/acc/dbcsr_acc_device.F +++ b/src/acc/dbcsr_acc_device.F @@ -24,7 +24,7 @@ MODULE dbcsr_acc_device #if defined (__DBCSR_ACC) INTERFACE FUNCTION acc_get_ndevices_cu(n_devices) RESULT(istat) & - BIND(C, name="acc_get_ndevices") + BIND(C, name="c_dbcsr_acc_get_ndevices") IMPORT INTEGER(KIND=C_INT), INTENT(OUT) :: n_devices INTEGER(KIND=C_INT) :: istat @@ -32,7 +32,7 @@ FUNCTION acc_get_ndevices_cu(n_devices) RESULT(istat) & END FUNCTION acc_get_ndevices_cu FUNCTION acc_set_active_device_cu(device_id) RESULT(istat) & - BIND(C, name="acc_set_active_device") + BIND(C, name="c_dbcsr_acc_set_active_device") IMPORT INTEGER(KIND=C_INT), INTENT(IN), VALUE :: device_id INTEGER(KIND=C_INT) :: istat @@ -40,7 +40,7 @@ FUNCTION acc_set_active_device_cu(device_id) RESULT(istat) & END FUNCTION acc_set_active_device_cu SUBROUTINE acc_clear_errors_cu() & - BIND(C, NAME="acc_clear_errors") + BIND(C, name="c_dbcsr_acc_clear_errors") END SUBROUTINE acc_clear_errors_cu END INTERFACE @@ -77,7 +77,7 @@ SUBROUTINE dbcsr_acc_set_active_device(device_id) !$OMP PARALLEL DEFAULT(NONE) PRIVATE(istat) SHARED(device_id) istat = acc_set_active_device_cu(device_id) IF (istat /= 0) & - DBCSR_ABORT("acc_set_active_device: failed") + DBCSR_ABORT("dbcsr_acc_set_active_device: failed") !$OMP END PARALLEL #else diff --git a/src/acc/dbcsr_acc_devmem.F b/src/acc/dbcsr_acc_devmem.F index b1a8d03019c..8ff4d86f6a2 100644 --- a/src/acc/dbcsr_acc_devmem.F +++ b/src/acc/dbcsr_acc_devmem.F @@ -76,7 +76,7 @@ MODULE dbcsr_acc_devmem #if defined (__DBCSR_ACC) INTERFACE - FUNCTION acc_interface_dev_mem_info(free, avail) RESULT(istat) BIND(C, name="acc_dev_mem_info") + FUNCTION acc_interface_dev_mem_info(free, avail) RESULT(istat) BIND(C, name="c_dbcsr_acc_dev_mem_info") IMPORT INTEGER(KIND=C_SIZE_T), INTENT(OUT) :: free, avail INTEGER(KIND=C_INT) :: istat @@ -85,7 +85,7 @@ END FUNCTION acc_interface_dev_mem_info END INTERFACE INTERFACE - FUNCTION acc_interface_dev_mem_alloc(mem, n) RESULT(istat) BIND(C, name="acc_dev_mem_allocate") + FUNCTION acc_interface_dev_mem_alloc(mem, n) RESULT(istat) BIND(C, name="c_dbcsr_acc_dev_mem_allocate") IMPORT TYPE(C_PTR) :: mem INTEGER(KIND=C_SIZE_T), INTENT(IN), & @@ -96,7 +96,7 @@ END FUNCTION acc_interface_dev_mem_alloc END INTERFACE INTERFACE - FUNCTION acc_interface_dev_mem_dealloc(mem) RESULT(istat) BIND(C, name="acc_dev_mem_deallocate") + FUNCTION acc_interface_dev_mem_dealloc(mem) RESULT(istat) BIND(C, name="c_dbcsr_acc_dev_mem_deallocate") IMPORT TYPE(C_PTR), VALUE :: mem INTEGER(KIND=C_INT) :: istat @@ -105,7 +105,7 @@ END FUNCTION acc_interface_dev_mem_dealloc END INTERFACE INTERFACE - FUNCTION acc_interface_dev_mem_set_ptr(mem, other, lb) RESULT(istat) BIND(C, name="acc_dev_mem_set_ptr") + FUNCTION acc_interface_dev_mem_set_ptr(mem, other, lb) RESULT(istat) BIND(C, name="c_dbcsr_acc_dev_mem_set_ptr") IMPORT TYPE(C_PTR) :: mem TYPE(C_PTR), VALUE :: other @@ -117,7 +117,7 @@ END FUNCTION acc_interface_dev_mem_set_ptr END INTERFACE INTERFACE - FUNCTION acc_interface_memzero(this, offset, length, stream_ptr) RESULT(istat) BIND(C, name="acc_memset_zero") + FUNCTION acc_interface_memzero(this, offset, length, stream_ptr) RESULT(istat) BIND(C, name="c_dbcsr_acc_memset_zero") IMPORT TYPE(C_PTR), INTENT(IN), VALUE :: this INTEGER(KIND=C_SIZE_T), INTENT(IN), & @@ -130,7 +130,7 @@ END FUNCTION acc_interface_memzero INTERFACE FUNCTION acc_interface_memcpy_h2d(host, dev, count, stream_ptr) RESULT(istat) & - BIND(C, name="acc_memcpy_h2d") + BIND(C, name="c_dbcsr_acc_memcpy_h2d") IMPORT TYPE(C_PTR), INTENT(IN), VALUE :: host TYPE(C_PTR), VALUE :: dev @@ -144,7 +144,7 @@ END FUNCTION acc_interface_memcpy_h2d INTERFACE FUNCTION acc_interface_memcpy_d2h(dev, host, count, stream_ptr) RESULT(istat) & - BIND(C, name="acc_memcpy_d2h") + BIND(C, name="c_dbcsr_acc_memcpy_d2h") IMPORT TYPE(C_PTR), INTENT(IN), VALUE :: dev TYPE(C_PTR), VALUE :: host @@ -158,7 +158,7 @@ END FUNCTION acc_interface_memcpy_d2h INTERFACE FUNCTION acc_interface_memcpy_d2d(dev_src, dev_dst, count, stream_ptr) RESULT(istat) & - BIND(C, name="acc_memcpy_d2d") + BIND(C, name="c_dbcsr_acc_memcpy_d2d") IMPORT TYPE(C_PTR), INTENT(IN), VALUE :: dev_src TYPE(C_PTR), VALUE :: dev_dst diff --git a/src/acc/dbcsr_acc_event.F b/src/acc/dbcsr_acc_event.F index c9276134001..49183aacb9c 100644 --- a/src/acc/dbcsr_acc_event.F +++ b/src/acc/dbcsr_acc_event.F @@ -39,7 +39,7 @@ MODULE dbcsr_acc_event #if defined (__DBCSR_ACC) INTERFACE - FUNCTION acc_interface_event_create(event_ptr) RESULT(istat) BIND(C, name="acc_event_create") + FUNCTION acc_interface_event_create(event_ptr) RESULT(istat) BIND(C, name="c_dbcsr_acc_event_create") IMPORT TYPE(C_PTR) :: event_ptr INTEGER(KIND=C_INT) :: istat @@ -48,7 +48,7 @@ END FUNCTION acc_interface_event_create END INTERFACE INTERFACE - FUNCTION acc_interface_event_destroy(event_ptr) RESULT(istat) BIND(C, name="acc_event_destroy") + FUNCTION acc_interface_event_destroy(event_ptr) RESULT(istat) BIND(C, name="c_dbcsr_acc_event_destroy") IMPORT TYPE(C_PTR), VALUE :: event_ptr INTEGER(KIND=C_INT) :: istat @@ -57,7 +57,7 @@ END FUNCTION acc_interface_event_destroy END INTERFACE INTERFACE - FUNCTION acc_interface_event_query(event_ptr, has_occurred) RESULT(istat) BIND(C, name="acc_event_query") + FUNCTION acc_interface_event_query(event_ptr, has_occurred) RESULT(istat) BIND(C, name="c_dbcsr_acc_event_query") IMPORT TYPE(C_PTR), VALUE :: event_ptr INTEGER(KIND=C_INT) :: has_occurred, istat @@ -66,7 +66,7 @@ END FUNCTION acc_interface_event_query END INTERFACE INTERFACE - FUNCTION acc_interface_event_record(event_ptr, stream_ptr) RESULT(istat) BIND(C, name="acc_event_record") + FUNCTION acc_interface_event_record(event_ptr, stream_ptr) RESULT(istat) BIND(C, name="c_dbcsr_acc_event_record") IMPORT TYPE(C_PTR), VALUE :: event_ptr, stream_ptr INTEGER(KIND=C_INT) :: istat @@ -75,7 +75,7 @@ END FUNCTION acc_interface_event_record END INTERFACE INTERFACE - FUNCTION acc_interface_stream_wait_event(stream_ptr, event_ptr) RESULT(istat) BIND(C, name="acc_stream_wait_event") + FUNCTION acc_interface_stream_wait_event(stream_ptr, event_ptr) RESULT(istat) BIND(C, name="c_dbcsr_acc_stream_wait_event") IMPORT TYPE(C_PTR), VALUE :: stream_ptr, event_ptr INTEGER(KIND=C_INT) :: istat @@ -84,7 +84,7 @@ END FUNCTION acc_interface_stream_wait_event END INTERFACE INTERFACE - FUNCTION acc_interface_event_synchronize(event_ptr) RESULT(istat) BIND(C, name="acc_event_synchronize") + FUNCTION acc_interface_event_synchronize(event_ptr) RESULT(istat) BIND(C, name="c_dbcsr_acc_event_synchronize") IMPORT TYPE(C_PTR), VALUE :: event_ptr INTEGER(KIND=C_INT) :: istat diff --git a/src/acc/dbcsr_acc_hostmem.F b/src/acc/dbcsr_acc_hostmem.F index f0d8a7457f8..c4e2d4f4874 100644 --- a/src/acc/dbcsr_acc_hostmem.F +++ b/src/acc/dbcsr_acc_hostmem.F @@ -56,7 +56,7 @@ MODULE dbcsr_acc_hostmem #if defined (__DBCSR_ACC) INTERFACE - FUNCTION acc_interface_host_mem_alloc(mem, n, stream_ptr) RESULT(istat) BIND(C, name="acc_host_mem_allocate") + FUNCTION acc_interface_host_mem_alloc(mem, n, stream_ptr) RESULT(istat) BIND(C, name="c_dbcsr_acc_host_mem_allocate") IMPORT TYPE(C_PTR) :: mem INTEGER(KIND=C_SIZE_T), INTENT(IN), & @@ -68,7 +68,7 @@ END FUNCTION acc_interface_host_mem_alloc END INTERFACE INTERFACE - FUNCTION acc_interface_host_mem_dealloc(mem, stream_ptr) RESULT(istat) bind(C, name="acc_host_mem_deallocate") + FUNCTION acc_interface_host_mem_dealloc(mem, stream_ptr) RESULT(istat) BIND(C, name="c_dbcsr_acc_host_mem_deallocate") IMPORT TYPE(C_PTR), VALUE :: mem, stream_ptr INTEGER(KIND=C_INT) :: istat diff --git a/src/acc/dbcsr_acc_init.F b/src/acc/dbcsr_acc_init.F index 99ff27b0138..fe7d6725273 100644 --- a/src/acc/dbcsr_acc_init.F +++ b/src/acc/dbcsr_acc_init.F @@ -25,14 +25,14 @@ MODULE dbcsr_acc_init #if defined (__DBCSR_ACC) INTERFACE - FUNCTION acc_interface_drv_init() RESULT(istat) BIND(C, name="acc_init") + FUNCTION acc_interface_drv_init() RESULT(istat) BIND(C, name="c_dbcsr_acc_init") IMPORT INTEGER(KIND=C_INT) :: istat END FUNCTION acc_interface_drv_init END INTERFACE INTERFACE - FUNCTION acc_interface_drv_finalize() RESULT(istat) BIND(C, name="acc_finalize") + FUNCTION acc_interface_drv_finalize() RESULT(istat) BIND(C, name="c_dbcsr_acc_finalize") IMPORT INTEGER(KIND=C_INT) :: istat END FUNCTION acc_interface_drv_finalize diff --git a/src/acc/dbcsr_acc_stream.F b/src/acc/dbcsr_acc_stream.F index efe2731621d..04138edf55b 100644 --- a/src/acc/dbcsr_acc_stream.F +++ b/src/acc/dbcsr_acc_stream.F @@ -39,7 +39,7 @@ MODULE dbcsr_acc_stream #if defined (__DBCSR_ACC) INTERFACE - FUNCTION acc_interface_stream_create(stream_ptr, name, priority) RESULT(istat) BIND(C, name="acc_stream_create") + FUNCTION acc_interface_stream_create(stream_ptr, name, priority) RESULT(istat) BIND(C, name="c_dbcsr_acc_stream_create") IMPORT TYPE(C_PTR) :: stream_ptr CHARACTER(KIND=C_CHAR), DIMENSION(*) :: name @@ -50,7 +50,7 @@ END FUNCTION acc_interface_stream_create END INTERFACE INTERFACE - FUNCTION acc_interface_stream_priority_range(least, greatest) RESULT(istat) BIND(C, name="acc_stream_priority_range") + FUNCTION acc_interface_stream_priority_range(least, greatest) RESULT(istat) BIND(C, name="c_dbcsr_acc_stream_priority_range") IMPORT INTEGER(KIND=C_INT) :: least, greatest, istat @@ -58,7 +58,7 @@ END FUNCTION acc_interface_stream_priority_range END INTERFACE INTERFACE - FUNCTION acc_interface_stream_destroy(stream_ptr) RESULT(istat) BIND(C, name="acc_stream_destroy") + FUNCTION acc_interface_stream_destroy(stream_ptr) RESULT(istat) BIND(C, name="c_dbcsr_acc_stream_destroy") IMPORT TYPE(C_PTR), VALUE :: stream_ptr INTEGER(KIND=C_INT) :: istat @@ -67,7 +67,7 @@ END FUNCTION acc_interface_stream_destroy END INTERFACE INTERFACE - FUNCTION acc_interface_stream_sync(stream_ptr) RESULT(istat) BIND(C, name="acc_stream_sync") + FUNCTION acc_interface_stream_sync(stream_ptr) RESULT(istat) BIND(C, name="c_dbcsr_acc_stream_sync") IMPORT TYPE(C_PTR), VALUE :: stream_ptr INTEGER(KIND=C_INT) :: istat diff --git a/src/acc/dbcsr_acc_timings.F b/src/acc/dbcsr_acc_timings.F index c87330f5efa..da3f011996f 100644 --- a/src/acc/dbcsr_acc_timings.F +++ b/src/acc/dbcsr_acc_timings.F @@ -20,7 +20,7 @@ MODULE dbcsr_acc_timings CONTAINS - SUBROUTINE dbcsr_timeset_F(routineN, routineN_len, handle) BIND(C, name="dbcsr_timeset") + SUBROUTINE dbcsr_timeset_F(routineN, routineN_len, handle) BIND(C, name="c_dbcsr_timeset") TYPE(C_PTR), INTENT(IN) :: routineN INTEGER(KIND=C_INT), INTENT(IN) :: routineN_len @@ -47,7 +47,7 @@ SUBROUTINE dbcsr_timeset_F(routineN, routineN_len, handle) BIND(C, name="dbcsr_t END SUBROUTINE dbcsr_timeset_F - SUBROUTINE dbcsr_timestop_F(handle) BIND(C, name="dbcsr_timestop") + SUBROUTINE dbcsr_timestop_F(handle) BIND(C, name="c_dbcsr_timestop") INTEGER(KIND=C_INT), INTENT(IN) :: handle diff --git a/src/acc/hip/dbcsr_hipblas.F b/src/acc/hip/dbcsr_hipblas.F index be6f5a53f6f..3ef9af4e7cc 100644 --- a/src/acc/hip/dbcsr_hipblas.F +++ b/src/acc/hip/dbcsr_hipblas.F @@ -36,7 +36,7 @@ MODULE dbcsr_hipblas INTERFACE FUNCTION hipblas_create_hip(handle) & RESULT(istat) & - BIND(C, name="hipblas_create") + BIND(C, name="hipblasCreate") IMPORT TYPE(C_PTR), INTENT(IN) :: handle INTEGER(KIND=C_INT) :: istat @@ -44,7 +44,7 @@ END FUNCTION hipblas_create_hip FUNCTION hipblas_destroy_hip(handle) & RESULT(istat) & - BIND(C, name="hipblas_destroy") + BIND(C, name="hipblasDestroy") IMPORT TYPE(C_PTR), INTENT(IN), VALUE :: handle INTEGER(KIND=C_INT) :: istat @@ -54,7 +54,7 @@ FUNCTION hipblas_dgemm_hip(handle, transa, transb, & m, n, k, a_offset, b_offset, c_offset, & a_data, b_data, c_data, alpha, beta, stream) & RESULT(istat) & - BIND(C, name="hipblas_dgemm") + BIND(C, name="hipblasDgemm") IMPORT TYPE(C_PTR), INTENT(IN), VALUE :: handle CHARACTER(KIND=C_CHAR), INTENT(IN), VALUE :: transa, transb diff --git a/src/acc/libsmm_acc/CMakeLists.txt b/src/acc/libsmm_acc/CMakeLists.txt index 71a73f284c2..1e39d45b264 100644 --- a/src/acc/libsmm_acc/CMakeLists.txt +++ b/src/acc/libsmm_acc/CMakeLists.txt @@ -7,94 +7,29 @@ set(SMM_ACC_KERNELS kernels/smm_acc_dnt_tiny.h kernels/smm_acc_transpose.h) -set(LIBSMM_ACC_SRC_FILES libsmm_acc.cpp libsmm_acc_benchmark.cpp - libsmm_acc_init.cpp) - -set(LIBSMM_ACC_HEADER_FILES - libsmm_acc.h libsmm_acc_benchmark.h libsmm_acc_init.h parameters.h - parameters_utils.h smm_acc_kernels.h) - -set(LIBSMM_ACC_FILES ${LIBSMM_ACC_SRC_FILES} ${LIBSMM_ACC_HEADER_FILES}) - -set(DBCSR_TIMING_SRCS_FTN - ../dbcsr_acc_timings.F ../../base/dbcsr_base_hooks.F - ../../base/dbcsr_machine.F ../../base/dbcsr_kinds.F - ../../base/dbcsr_machine_internal.F) - -add_custom_command( +add_custom_target( + parameters ALL COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_parameters.py --gpu_version=${WITH_GPU} --base_dir=${CMAKE_CURRENT_SOURCE_DIR}/parameters DEPENDS generate_parameters.py parameters/parameters_${WITH_GPU}.json - OUTPUT parameters.h + BYPRODUCTS parameters.h COMMENT "libsmm_acc: generating parameters for GPU ${WITH_GPU}") -add_custom_command( +add_custom_target( + smm_acc_kernels ALL COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_kernels.py ${CMAKE_CURRENT_SOURCE_DIR}/kernels DEPENDS generate_kernels.py ${SMM_ACC_KERNELS} - OUTPUT smm_acc_kernels.h + BYPRODUCTS smm_acc_kernels.h COMMENT "libsmm_acc: generating kernels") -if (USE_ACCEL MATCHES "cuda") - - add_library(libsmm_acc OBJECT ${LIBSMM_ACC_FILES}) - target_compile_definitions(libsmm_acc PRIVATE __CUDA) - target_compile_definitions(libsmm_acc PRIVATE ARCH_NUMBER=${ACC_ARCH_NUMBER}) - target_compile_definitions( - libsmm_acc PRIVATE $<$:__CUDA_PROFILING>) - -elseif (USE_ACCEL MATCHES "hip") - - set_source_files_properties(${LIBSMM_ACC_SRC_FILES} - PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - # hip_add_library does not support OBJECT libraries. For this reason, we are - # forced to use a static library - hip_add_library(libsmm_acc STATIC ${LIBSMM_ACC_FILES}) - target_include_directories(libsmm_acc PRIVATE ${HIP_PATH}/../include) - target_link_libraries(libsmm_acc INTERFACE "stdc++") - target_link_libraries(libsmm_acc PUBLIC ${HIPBLAS}) - if (USE_OPENMP) - # since HIP is based on clang, not GCC, doing: - # target_link_libraries(libsmm_acc PUBLIC OpenMP::OpenMP_CXX) does not work - # - it links to GNU's OpenMP (libgomp.so) and not clang's libomp.so - target_link_libraries(libsmm_acc PUBLIC ${HIP_OpenMP_FLAGS}) - endif () - - # libsmm_acc calls timing functions - since we're compiling libsmm_acc as a - # static library, we have to link - add_library(dbcsr_timings OBJECT ${DBCSR_TIMING_SRCS_FTN}) - target_include_directories(dbcsr_timings PRIVATE "${CMAKE_SOURCE_DIR}/src") - target_compile_definitions(dbcsr_timings PRIVATE __DBCSR_ACC) - target_sources(libsmm_acc PRIVATE $) - - # Workaround issue in hip_add_library: explicitely write dependency between - # the generation of header files and the compilation of the libsmm_acc files - add_custom_target(generate_smm_acc_kernels_h DEPENDS smm_acc_kernels.h) - add_dependencies(libsmm_acc generate_smm_acc_kernels_h) - add_custom_target(generate_parameters_h DEPENDS parameters.h) - add_dependencies(libsmm_acc generate_parameters_h) - - install( - TARGETS libsmm_acc - EXPORT libsmm_accTargets - LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" - ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}") - target_compile_definitions(libsmm_acc PRIVATE __HIP) - target_compile_definitions(libsmm_acc PRIVATE ARCH_NUMBER=${ACC_ARCH_NUMBER}) - -endif () - -if (OpenMP_FOUND) - # with CMake 3.12+ the following can be replaced by: - # target_link_libraries(libcusmm PRIVATE OpenMP::OpenMP_CXX) - target_compile_options( - libsmm_acc - PRIVATE $) -endif () - -target_include_directories(libsmm_acc PRIVATE ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}) +add_dependencies(dbcsr smm_acc_kernels parameters) +target_include_directories(dbcsr PRIVATE ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}) -target_include_directories(libsmm_acc SYSTEM - PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +# Note: this library is only used in some of the tests, it's just to get include +# paths to generated header files. +add_library(libsmm_acc INTERFACE) +target_include_directories(libsmm_acc INTERFACE ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/src/acc/libsmm_acc/kernels/smm_acc_common.h b/src/acc/libsmm_acc/kernels/smm_acc_common.h index 766b8c1055e..2c93053c7f0 100644 --- a/src/acc/libsmm_acc/kernels/smm_acc_common.h +++ b/src/acc/libsmm_acc/kernels/smm_acc_common.h @@ -7,6 +7,12 @@ * SPDX-License-Identifier: GPL-2.0+ * *------------------------------------------------------------------------------------------------*/ +// work around an issue where -D flags are not propagated in hiprtcCompileProgram (tested on 3.9.0) +#if defined(__HIP_ROCclr__) +#if !defined(__HIP) +#define __HIP +#endif +#endif #if defined(__HIP) && !defined(__HIP_PLATFORM_NVCC__) # include #endif diff --git a/src/acc/libsmm_acc/libsmm_acc.cpp b/src/acc/libsmm_acc/libsmm_acc.cpp index 2663cb91068..5f0ab558dcc 100644 --- a/src/acc/libsmm_acc/libsmm_acc.cpp +++ b/src/acc/libsmm_acc/libsmm_acc.cpp @@ -28,17 +28,15 @@ // MACRO HELPERS #define STRINGIFY_NX(x) #x #define STRINGIFY(x) STRINGIFY_NX(x) -#define CONCAT_NX(A, B) A ## B -#define CONCAT(A, B) CONCAT_NX(A, B) // The macro ARCH_OPTION, when expanded, is a string literal containing the // jit compiler option specifying the target architecture #if defined(__CUDA) || defined(__HIP_PLATFORM_NVCC__) -#define ARCH_OPTION_NAME --gpu-architecture=compute_ +#define ARCH_OPTION_NAME "--gpu-architecture=compute_" #else -#define ARCH_OPTION_NAME --amdgpu-target= +#define ARCH_OPTION_NAME "--gpu-architecture=" #endif -#define ARCH_OPTION STRINGIFY(CONCAT(ARCH_OPTION_NAME, ARCH_NUMBER)) +#define ARCH_OPTION ARCH_OPTION_NAME STRINGIFY(ARCH_NUMBER) //=========================================================================== @@ -144,7 +142,7 @@ inline void jit_kernel(ACC_DRV(function)& kern_func, libsmm_acc_algo algo, int t // Create JIT program ACC_RTC(Program) kernel_program; - ACC_RTC_CALL(CreateProgram, (&kernel_program, kernel_code.c_str(), "smm_acc_kernel.cpp", 0, NULL, NULL)); + ACC_RTC_CALL(CreateProgram, (&kernel_program, kernel_code.c_str(), "smm_acc_kernel.cu", 0, NULL, NULL)); // Add lowered name ACC_RTC_CALL(AddNameExpression, (kernel_program, kernel_name.c_str())); @@ -154,8 +152,8 @@ inline void jit_kernel(ACC_DRV(function)& kern_func, libsmm_acc_algo algo, int t const char *compileOptions[] = {"-D__CUDA", "-w", ARCH_OPTION}; size_t nOptions = 3; #else - const char *compileOptions[] = {"-D__HIP"}; - size_t nOptions = 1; + const char *compileOptions[] = {"-D__HIP", ARCH_OPTION}; + size_t nOptions = 2; #endif ACC_RTC_CALL(CompileProgram, (kernel_program, nOptions, compileOptions)); @@ -355,7 +353,7 @@ void jit_transpose_handle(ACC_DRV(function)& kern_func, int m, int n){ // Create nvrtcProgram ACC_RTC(Program) kernel_program; std::string transpose_code = smm_acc_common + smm_acc_transpose; - ACC_RTC_CALL(CreateProgram, (&kernel_program, transpose_code.c_str(), "transpose_kernel.cpp", 0, NULL, NULL)); + ACC_RTC_CALL(CreateProgram, (&kernel_program, transpose_code.c_str(), "transpose_kernel.cu", 0, NULL, NULL)); // Add lowered name std::string kernel_name = "transpose_d<" + std::to_string(m) + ", " + std::to_string(n) + ">"; @@ -366,8 +364,8 @@ void jit_transpose_handle(ACC_DRV(function)& kern_func, int m, int n){ const char *compileOptions[] = {"-D__CUDA", "-w", ARCH_OPTION}; size_t nOptions = 3; #else - const char *compileOptions[] = {"-D__HIP"}; - size_t nOptions = 1; + const char *compileOptions[] = {"-D__HIP", ARCH_OPTION}; + size_t nOptions = 2; #endif ACC_RTC_CALL(CompileProgram, (kernel_program, nOptions, compileOptions)); diff --git a/src/acc/libsmm_acc/libsmm_acc_init.cpp b/src/acc/libsmm_acc/libsmm_acc_init.cpp index 881f3b47b3e..27ae85b722e 100644 --- a/src/acc/libsmm_acc/libsmm_acc_init.cpp +++ b/src/acc/libsmm_acc/libsmm_acc_init.cpp @@ -23,11 +23,11 @@ std::vector acc_blashandles; void timeset(const std::string& routine_name, int& handle){ const char* routine_name_ = routine_name.c_str(); int routine_name_length = routine_name.length(); - dbcsr_timeset(&routine_name_, &routine_name_length, &handle); + c_dbcsr_timeset(&routine_name_, &routine_name_length, &handle); } void timestop(int handle){ - dbcsr_timestop(&handle); + c_dbcsr_timestop(&handle); } #endif diff --git a/src/acc/libsmm_acc/libsmm_acc_init.h b/src/acc/libsmm_acc/libsmm_acc_init.h index 17f4f21a7b3..0565a23ceb0 100644 --- a/src/acc/libsmm_acc/libsmm_acc_init.h +++ b/src/acc/libsmm_acc/libsmm_acc_init.h @@ -16,10 +16,10 @@ #include #if !defined(NO_DBCSR_TIMESET) -extern "C" void dbcsr_timeset(const char** routineN, int* routineN_len, int* handle); +extern "C" void c_dbcsr_timeset(const char** routineN, int* routineN_len, int* handle); void timeset(const std::string& routine_name, int& handle); -extern "C" void dbcsr_timestop(int* handle); +extern "C" void c_dbcsr_timestop(int* handle); void timestop(int handle); #endif diff --git a/src/acc/opencl/Makefile b/src/acc/opencl/Makefile index fce57d38f70..d178cc1039d 100644 --- a/src/acc/opencl/Makefile +++ b/src/acc/opencl/Makefile @@ -127,7 +127,7 @@ acc_bench_trans.o: ../acc_bench_trans.c Makefile $(CC) $^ $(LDFLAGS) -o $@ dbcsr_acc_test.o: ../../../tests/dbcsr_acc_test.c Makefile - $(CC) $(CFLAGS) -c $< -o $@ + $(CC) $(CFLAGS) -I../.. -c $< -o $@ ../dbcsr_acc_test: dbcsr_acc_test.o ../dbcsr_acc.a $(CC) $^ $(LDFLAGS) -o $@ diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c index 9086047a029..88553be672f 100644 --- a/src/acc/opencl/acc_opencl.c +++ b/src/acc/opencl/acc_opencl.c @@ -52,7 +52,7 @@ void acc_opencl_notify(const char* errinfo, const void* private_info, size_t cb, #endif -const char* acc_opencl_stristr(const char* a, const char* b) +const char* c_dbcsr_acc_opencl_stristr(const char* a, const char* b) { const char* result = NULL; if (NULL != a && NULL != b && '\0' != *a && '\0' != *b) { @@ -81,8 +81,8 @@ const char* acc_opencl_stristr(const char* a, const char* b) /* comparator used with qsort; stabilized by tail condition (a < b ? -1 : 1) */ -int acc_opencl_order_devices(const void* /*dev_a*/, const void* /*dev_b*/); -int acc_opencl_order_devices(const void* dev_a, const void* dev_b) +int c_dbcsr_acc_opencl_order_devices(const void* /*dev_a*/, const void* /*dev_b*/); +int c_dbcsr_acc_opencl_order_devices(const void* dev_a, const void* dev_b) { const cl_device_id *const a = (const cl_device_id*)dev_a; const cl_device_id *const b = (const cl_device_id*)dev_b; @@ -98,8 +98,8 @@ int acc_opencl_order_devices(const void* dev_a, const void* dev_b) if (CL_DEVICE_TYPE_GPU & type_a) { if (CL_DEVICE_TYPE_GPU & type_b) { size_t size_a, size_b; - ACC_OPENCL_EXPECT(EXIT_SUCCESS, acc_opencl_info_devmem(*a, NULL, &size_a)); - ACC_OPENCL_EXPECT(EXIT_SUCCESS, acc_opencl_info_devmem(*b, NULL, &size_b)); + ACC_OPENCL_EXPECT(EXIT_SUCCESS, c_dbcsr_acc_opencl_info_devmem(*a, NULL, &size_a)); + ACC_OPENCL_EXPECT(EXIT_SUCCESS, c_dbcsr_acc_opencl_info_devmem(*b, NULL, &size_b)); return (size_a < size_b ? 1 : (size_a != size_b ? -1 : (a < b ? -1 : 1))); } else return -1; @@ -109,8 +109,8 @@ int acc_opencl_order_devices(const void* dev_a, const void* dev_b) if (CL_DEVICE_TYPE_ACCELERATOR & type_a) { if (CL_DEVICE_TYPE_ACCELERATOR & type_b) { size_t size_a, size_b; - ACC_OPENCL_EXPECT(EXIT_SUCCESS, acc_opencl_info_devmem(*a, NULL, &size_a)); - ACC_OPENCL_EXPECT(EXIT_SUCCESS, acc_opencl_info_devmem(*b, NULL, &size_b)); + ACC_OPENCL_EXPECT(EXIT_SUCCESS, c_dbcsr_acc_opencl_info_devmem(*a, NULL, &size_a)); + ACC_OPENCL_EXPECT(EXIT_SUCCESS, c_dbcsr_acc_opencl_info_devmem(*b, NULL, &size_b)); return (size_a < size_b ? 1 : (size_a != size_b ? -1 : (a < b ? -1 : 1))); } else return -1; @@ -118,8 +118,8 @@ int acc_opencl_order_devices(const void* dev_a, const void* dev_b) else if (CL_DEVICE_TYPE_ACCELERATOR & type_b) return 1; else { size_t size_a, size_b; - ACC_OPENCL_EXPECT(EXIT_SUCCESS, acc_opencl_info_devmem(*a, NULL, &size_a)); - ACC_OPENCL_EXPECT(EXIT_SUCCESS, acc_opencl_info_devmem(*b, NULL, &size_b)); + ACC_OPENCL_EXPECT(EXIT_SUCCESS, c_dbcsr_acc_opencl_info_devmem(*a, NULL, &size_a)); + ACC_OPENCL_EXPECT(EXIT_SUCCESS, c_dbcsr_acc_opencl_info_devmem(*b, NULL, &size_b)); return (size_a < size_b ? 1 : (size_a != size_b ? -1 : (a < b ? -1 : 1))); } } @@ -127,7 +127,7 @@ int acc_opencl_order_devices(const void* dev_a, const void* dev_b) } -int acc_init(void) +int c_dbcsr_acc_init(void) { #if defined(_OPENMP) /* initialization/finalization is not meant to be thread-safe */ @@ -152,8 +152,8 @@ int acc_init(void) nplatforms <= ACC_OPENCL_DEVICES_MAXCOUNT ? nplatforms : ACC_OPENCL_DEVICES_MAXCOUNT, platforms, 0), "retrieve platform ids", result); if (NULL != env_device_type && '\0' != *env_device_type) { - if (NULL != acc_opencl_stristr(env_device_type, "gpu")) type = CL_DEVICE_TYPE_GPU; - else if (NULL != acc_opencl_stristr(env_device_type, "cpu")) type = CL_DEVICE_TYPE_CPU; + if (NULL != c_dbcsr_acc_opencl_stristr(env_device_type, "gpu")) type = CL_DEVICE_TYPE_GPU; + else if (NULL != c_dbcsr_acc_opencl_stristr(env_device_type, "cpu")) type = CL_DEVICE_TYPE_CPU; else type = CL_DEVICE_TYPE_ACCELERATOR; } acc_opencl_ndevices = 0; @@ -181,7 +181,7 @@ int acc_init(void) if (CL_SUCCESS == clGetDeviceInfo(acc_opencl_devices[i], CL_DEVICE_VENDOR, ACC_OPENCL_BUFFERSIZE, buffer, NULL)) { - if (NULL == acc_opencl_stristr(buffer, env_device_vendor)) { + if (NULL == c_dbcsr_acc_opencl_stristr(buffer, env_device_vendor)) { --acc_opencl_ndevices; if (i < (cl_uint)acc_opencl_ndevices) { /* keep relative order of IDs */ memmove(acc_opencl_devices + i, acc_opencl_devices + i + 1, @@ -201,7 +201,7 @@ int acc_init(void) if (EXIT_SUCCESS == result && 1 < acc_opencl_ndevices) { /* reorder devices according to acc_opencl_order_devices */ qsort(acc_opencl_devices, acc_opencl_ndevices, - sizeof(cl_device_id), acc_opencl_order_devices); + sizeof(cl_device_id), c_dbcsr_acc_opencl_order_devices); /* preselect default device */ if (NULL == env_device_id || '\0' == *env_device_id) { for (i = 0; i < (cl_uint)acc_opencl_ndevices; ++i) { @@ -217,7 +217,7 @@ int acc_init(void) } if (EXIT_SUCCESS == result) { cl_device_id active_device; - result = acc_opencl_set_active_device(device_id, &active_device); + result = c_dbcsr_acc_opencl_set_active_device(device_id, &active_device); #if defined(_OPENMP) && defined(ACC_OPENCL_THREADLOCAL_CONTEXT) if (EXIT_SUCCESS == result) { const cl_context context = acc_opencl_context; @@ -237,7 +237,7 @@ int acc_init(void) if (EXIT_SUCCESS == result) { const char *const env = getenv("ACC_OPENCL_ASYNC_MEMOPS"); if (NULL == env) { - const int confirmation = acc_opencl_device_vendor(active_device, "nvidia"); + const int confirmation = c_dbcsr_acc_opencl_device_vendor(active_device, "nvidia"); acc_opencl_options.async_memops = (EXIT_SUCCESS != confirmation); } else acc_opencl_options.async_memops = (0 != atoi(env)); @@ -280,7 +280,7 @@ int acc_init(void) } -int acc_finalize(void) +int c_dbcsr_acc_finalize(void) { #if defined(_OPENMP) /* initialization/finalization is not meant to be thread-safe */ @@ -317,18 +317,18 @@ int acc_finalize(void) } -void acc_clear_errors(void) +void c_dbcsr_acc_clear_errors(void) { } -int acc_get_ndevices(int* ndevices) +int c_dbcsr_acc_get_ndevices(int* ndevices) { int result; #if defined(__DBCSR_ACC) /* DBCSR calls acc_get_ndevices before calling acc_init(). */ - result = acc_init(); + result = c_dbcsr_acc_init(); if (EXIT_SUCCESS == result) #endif { @@ -344,7 +344,7 @@ int acc_get_ndevices(int* ndevices) } -int acc_opencl_device(void* stream, cl_device_id* device) +int c_dbcsr_acc_opencl_device(void* stream, cl_device_id* device) { int result = EXIT_SUCCESS; assert(NULL != device); @@ -370,7 +370,7 @@ int acc_opencl_device(void* stream, cl_device_id* device) } -int acc_opencl_device_vendor(cl_device_id device, const char* vendor) +int c_dbcsr_acc_opencl_device_vendor(cl_device_id device, const char* vendor) { char buffer[ACC_OPENCL_BUFFERSIZE]; int result = EXIT_SUCCESS; @@ -380,7 +380,7 @@ int acc_opencl_device_vendor(cl_device_id device, const char* vendor) CL_DEVICE_VENDOR, ACC_OPENCL_BUFFERSIZE, buffer, NULL), "retrieve device vendor", result); if (EXIT_SUCCESS == result) { - return (NULL != acc_opencl_stristr(buffer, vendor) + return (NULL != c_dbcsr_acc_opencl_stristr(buffer, vendor) ? EXIT_SUCCESS : EXIT_FAILURE); } @@ -388,7 +388,7 @@ int acc_opencl_device_vendor(cl_device_id device, const char* vendor) } -int acc_opencl_device_level(cl_device_id device, int* level_major, int* level_minor) +int c_dbcsr_acc_opencl_device_level(cl_device_id device, int* level_major, int* level_minor) { char buffer[ACC_OPENCL_BUFFERSIZE]; int result = EXIT_SUCCESS; @@ -411,7 +411,7 @@ int acc_opencl_device_level(cl_device_id device, int* level_major, int* level_mi } -int acc_opencl_device_ext(cl_device_id device, const char *const extnames[], int num_exts) +int c_dbcsr_acc_opencl_device_ext(cl_device_id device, const char *const extnames[], int num_exts) { int result = ((NULL != extnames && 0 < num_exts) ? EXIT_SUCCESS : EXIT_FAILURE); char extensions[ACC_OPENCL_BUFFERSIZE], buffer[ACC_OPENCL_BUFFERSIZE]; @@ -440,7 +440,7 @@ int acc_opencl_device_ext(cl_device_id device, const char *const extnames[], int } -int acc_opencl_set_active_device(int device_id, cl_device_id* device) +int c_dbcsr_acc_opencl_set_active_device(int device_id, cl_device_id* device) { cl_int result = (((0 <= device_id && device_id < acc_opencl_ndevices) || /* allow successful completion if no device was found */ @@ -448,7 +448,7 @@ int acc_opencl_set_active_device(int device_id, cl_device_id* device) if (0 < acc_opencl_ndevices) { const cl_device_id active_id = acc_opencl_devices[device_id]; cl_device_id current_id = NULL; - if (EXIT_SUCCESS == result) result = acc_opencl_device(NULL/*stream*/, ¤t_id); + if (EXIT_SUCCESS == result) result = c_dbcsr_acc_opencl_device(NULL/*stream*/, ¤t_id); if (EXIT_SUCCESS == result && active_id != current_id) { if (NULL != acc_opencl_context) { ACC_OPENCL_CHECK(clReleaseContext(acc_opencl_context), @@ -485,13 +485,13 @@ int acc_opencl_set_active_device(int device_id, cl_device_id* device) } -int acc_set_active_device(int device_id) +int c_dbcsr_acc_set_active_device(int device_id) { - ACC_OPENCL_RETURN(acc_opencl_set_active_device(device_id, NULL/*device*/)); + ACC_OPENCL_RETURN(c_dbcsr_acc_opencl_set_active_device(device_id, NULL/*device*/)); } -int acc_opencl_wgsize(cl_device_id device, cl_kernel kernel, +int c_dbcsr_acc_opencl_wgsize(cl_device_id device, cl_kernel kernel, int* max_value, int* preferred_multiple) { int result = (NULL != device && (NULL != preferred_multiple @@ -543,7 +543,7 @@ int acc_opencl_wgsize(cl_device_id device, cl_kernel kernel, } -int acc_opencl_kernel(const char* source, const char* build_options, +int c_dbcsr_acc_opencl_kernel(const char* source, const char* build_options, const char* kernel_name, cl_kernel* kernel) { char buffer[ACC_OPENCL_BUFFERSIZE] = "\0"; @@ -555,7 +555,7 @@ int acc_opencl_kernel(const char* source, const char* build_options, if (NULL != program) { cl_device_id active_id = NULL; assert(CL_SUCCESS == result); - result = acc_opencl_device(NULL/*stream*/, &active_id); + result = c_dbcsr_acc_opencl_device(NULL/*stream*/, &active_id); if (EXIT_SUCCESS == result) { result = clBuildProgram(program, 1/*num_devices*/, &active_id, build_options, diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h index f0c0988c8b9..6c8f5214a14 100644 --- a/src/acc/opencl/acc_opencl.h +++ b/src/acc/opencl/acc_opencl.h @@ -204,40 +204,40 @@ extern cl_context acc_opencl_context; # pragma omp threadprivate(acc_opencl_context) #endif -typedef struct acc_opencl_info_hostptr_t { +typedef struct c_dbcsr_acc_opencl_info_hostptr_t { cl_mem buffer; void* mapped; -} acc_opencl_info_hostptr_t; +} c_dbcsr_acc_opencl_info_hostptr_t; /** Information about host-memory pointer (acc_host_mem_allocate). */ -acc_opencl_info_hostptr_t* acc_opencl_info_hostptr(void* memory); +c_dbcsr_acc_opencl_info_hostptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory); /** Get host-pointer associated with device-memory (acc_dev_mem_allocate). */ -void* acc_opencl_get_hostptr(cl_mem memory); +void* c_dbcsr_acc_opencl_get_hostptr(cl_mem memory); /** Information about amount of device memory. */ -int acc_opencl_info_devmem(cl_device_id device, +int c_dbcsr_acc_opencl_info_devmem(cl_device_id device, size_t* mem_free, size_t* mem_total); /** Return the pointer to the 1st match of "b" in "a", or NULL (no match). */ -const char* acc_opencl_stristr(const char* a, const char* b); +const char* c_dbcsr_acc_opencl_stristr(const char* a, const char* b); /** Get active device (can be thread/queue-specific). */ -int acc_opencl_device(void* stream, cl_device_id* device); +int c_dbcsr_acc_opencl_device(void* stream, cl_device_id* device); /** Confirm the vendor of the given device. */ -int acc_opencl_device_vendor(cl_device_id device, const char* vendor); +int c_dbcsr_acc_opencl_device_vendor(cl_device_id device, const char* vendor); /** Return the OpenCL support level for the given device. */ -int acc_opencl_device_level(cl_device_id device, +int c_dbcsr_acc_opencl_device_level(cl_device_id device, int* level_major, int* level_minor); /** Check if given device supports the extensions. */ -int acc_opencl_device_ext(cl_device_id device, +int c_dbcsr_acc_opencl_device_ext(cl_device_id device, const char *const extnames[], int num_exts); /** Internal flavor of acc_set_active_device; yields cl_device_id. */ -int acc_opencl_set_active_device(int device_id, cl_device_id* device); +int c_dbcsr_acc_opencl_set_active_device(int device_id, cl_device_id* device); /** Get preferred multiple and max. size of workgroup (kernel- or device-specific). */ -int acc_opencl_wgsize(cl_device_id device, cl_kernel kernel, +int c_dbcsr_acc_opencl_wgsize(cl_device_id device, cl_kernel kernel, int* max_value, int* preferred_multiple); /** Build kernel function with given name from source using given build_options. */ -int acc_opencl_kernel(const char* source, const char* build_options, +int c_dbcsr_acc_opencl_kernel(const char* source, const char* build_options, const char* kernel_name, cl_kernel* kernel); /** Create command queue (stream). */ -int acc_opencl_stream_create(cl_command_queue* stream_p, const char* name, +int c_dbcsr_acc_opencl_stream_create(cl_command_queue* stream_p, const char* name, const ACC_OPENCL_COMMAND_QUEUE_PROPERTIES* properties); #if defined(__cplusplus) diff --git a/src/acc/opencl/acc_opencl_event.c b/src/acc/opencl/acc_opencl_event.c index 635430ebb52..fd59a69611d 100644 --- a/src/acc/opencl/acc_opencl_event.c +++ b/src/acc/opencl/acc_opencl_event.c @@ -26,7 +26,7 @@ extern "C" { #endif -int acc_event_create(void** event_p) +int c_dbcsr_acc_event_create(void** event_p) { cl_int result = EXIT_SUCCESS; const cl_event event = clCreateUserEvent(acc_opencl_context, &result); @@ -68,7 +68,7 @@ int acc_event_create(void** event_p) } -int acc_event_destroy(void* event) +int c_dbcsr_acc_event_destroy(void* event) { int result = EXIT_SUCCESS; if (NULL != event) { @@ -84,7 +84,7 @@ int acc_event_destroy(void* event) } -int acc_event_record(void* event, void* stream) +int c_dbcsr_acc_event_record(void* event, void* stream) { int result = EXIT_SUCCESS; assert(NULL != event && NULL != stream); @@ -94,7 +94,7 @@ int acc_event_record(void* event, void* stream) } -int acc_event_query(void* event, acc_bool_t* has_occurred) +int c_dbcsr_acc_event_query(void* event, acc_bool_t* has_occurred) { int result = EXIT_SUCCESS; cl_int status = CL_COMPLETE; @@ -108,18 +108,18 @@ int acc_event_query(void* event, acc_bool_t* has_occurred) assert(NULL != has_occurred); *has_occurred = (CL_COMPLETE == status || 0 > status); #if defined(ACC_OPENCL_VERBOSE) && defined(_DEBUG) - fprintf(stderr, "acc_event_query(%p, %i)\n", event, *has_occurred); + fprintf(stderr, "c_dbcsr_acc_event_query(%p, %i)\n", event, *has_occurred); #endif ACC_OPENCL_RETURN(result); } -int acc_event_synchronize(void* event) +int c_dbcsr_acc_event_synchronize(void* event) { /* Waits on the host-side. */ int result = EXIT_SUCCESS; assert(NULL != event); #if defined(ACC_OPENCL_VERBOSE) && defined(_DEBUG) - fprintf(stderr, "acc_event_synchronize(%p)\n", event); + fprintf(stderr, "c_dbcsr_acc_event_synchronize(%p)\n", event); #endif ACC_OPENCL_CHECK(clWaitForEvents(1, ACC_OPENCL_EVENT(event)), "synchronize event", result); diff --git a/src/acc/opencl/acc_opencl_mem.c b/src/acc/opencl/acc_opencl_mem.c index ea4d89e7d8a..17c904121bc 100644 --- a/src/acc/opencl/acc_opencl_mem.c +++ b/src/acc/opencl/acc_opencl_mem.c @@ -34,8 +34,8 @@ extern "C" { #endif -int acc_opencl_memalignment(size_t /*size*/); -int acc_opencl_memalignment(size_t size) +int c_dbcsr_acc_opencl_memalignment(size_t /*size*/); +int c_dbcsr_acc_opencl_memalignment(size_t size) { int result; if ((ACC_OPENCL_MEM_ALIGNSCALE * ACC_OPENCL_MAXALIGN_NBYTES) <= size) { @@ -51,16 +51,16 @@ int acc_opencl_memalignment(size_t size) } -acc_opencl_info_hostptr_t* acc_opencl_info_hostptr(void* memory) +c_dbcsr_acc_opencl_info_hostptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory) { - assert(NULL == memory || sizeof(acc_opencl_info_hostptr_t) <= (uintptr_t)memory); + assert(NULL == memory || sizeof(c_dbcsr_acc_opencl_info_hostptr_t) <= (uintptr_t)memory); return (NULL != memory - ? (acc_opencl_info_hostptr_t*)((uintptr_t)memory - sizeof(acc_opencl_info_hostptr_t)) - : (acc_opencl_info_hostptr_t*)NULL); + ? (c_dbcsr_acc_opencl_info_hostptr_t*)((uintptr_t)memory - sizeof(c_dbcsr_acc_opencl_info_hostptr_t)) + : (c_dbcsr_acc_opencl_info_hostptr_t*)NULL); } -void* acc_opencl_get_hostptr(cl_mem memory) +void* c_dbcsr_acc_opencl_get_hostptr(cl_mem memory) { void* result = NULL; assert(acc_opencl_options.svm_interop); @@ -71,11 +71,11 @@ void* acc_opencl_get_hostptr(cl_mem memory) } -int acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream) +int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream) { cl_int result; - const int alignment = acc_opencl_memalignment(nbytes); - const size_t size_meminfo = sizeof(acc_opencl_info_hostptr_t); + const int alignment = c_dbcsr_acc_opencl_memalignment(nbytes); + const size_t size_meminfo = sizeof(c_dbcsr_acc_opencl_info_hostptr_t); const size_t size = nbytes + alignment + size_meminfo - 1; const cl_mem buffer = ( #if defined(ACC_OPENCL_SVM) @@ -91,16 +91,16 @@ int acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream) 0/*offset*/, size, 0, NULL, NULL, &result); if (0 != address) { const uintptr_t aligned = ACC_OPENCL_UP2(address + size_meminfo, alignment); - acc_opencl_info_hostptr_t* meminfo; + c_dbcsr_acc_opencl_info_hostptr_t* meminfo; assert(address + size_meminfo <= aligned); assert(CL_SUCCESS == result); #if defined(ACC_OPENCL_MEM_MAPMULTI) assert(0 < aligned - address - size_meminfo); - meminfo = (acc_opencl_info_hostptr_t*)clEnqueueMapBuffer(queue, buffer, + meminfo = (c_dbcsr_acc_opencl_info_hostptr_t*)clEnqueueMapBuffer(queue, buffer, CL_TRUE/*blocking*/, CL_MAP_READ | CL_MAP_WRITE, aligned - address - size_meminfo, size_meminfo, 0, NULL, NULL, &result); #else - meminfo = (acc_opencl_info_hostptr_t*)(aligned - size_meminfo); + meminfo = (c_dbcsr_acc_opencl_info_hostptr_t*)(aligned - size_meminfo); #endif if (NULL != meminfo) { meminfo->buffer = buffer; @@ -127,13 +127,13 @@ int acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream) } -int acc_host_mem_deallocate(void* host_mem, void* stream) +int c_dbcsr_acc_host_mem_deallocate(void* host_mem, void* stream) { int result = EXIT_SUCCESS; assert(NULL != stream); if (NULL != host_mem) { - acc_opencl_info_hostptr_t *const meminfo = acc_opencl_info_hostptr(host_mem); - const acc_opencl_info_hostptr_t info = *meminfo; /* copy meminfo prior to unmap */ + c_dbcsr_acc_opencl_info_hostptr_t *const meminfo = c_dbcsr_acc_opencl_info_hostptr(host_mem); + const c_dbcsr_acc_opencl_info_hostptr_t info = *meminfo; /* copy meminfo prior to unmap */ const cl_command_queue queue = *ACC_OPENCL_STREAM(stream); if (NULL != meminfo->buffer) { #if defined(ACC_OPENCL_MEM_MAPMULTI) @@ -153,7 +153,7 @@ int acc_host_mem_deallocate(void* host_mem, void* stream) } -int acc_dev_mem_allocate(void** dev_mem, size_t nbytes) +int c_dbcsr_acc_dev_mem_allocate(void** dev_mem, size_t nbytes) { cl_int result; const cl_mem buffer = ( @@ -176,7 +176,7 @@ int acc_dev_mem_allocate(void** dev_mem, size_t nbytes) else { #if defined(ACC_OPENCL_SVM) void *const ptr = (acc_opencl_options.svm_interop - ? acc_opencl_get_hostptr(buffer) : NULL); + ? c_dbcsr_acc_opencl_get_hostptr(buffer) : NULL); #endif clReleaseMemObject(buffer); #if defined(ACC_OPENCL_SVM) @@ -195,14 +195,14 @@ int acc_dev_mem_allocate(void** dev_mem, size_t nbytes) } -int acc_dev_mem_deallocate(void* dev_mem) +int c_dbcsr_acc_dev_mem_deallocate(void* dev_mem) { int result = EXIT_SUCCESS; if (NULL != dev_mem) { const cl_mem buffer = *ACC_OPENCL_MEM(dev_mem); #if defined(ACC_OPENCL_SVM) void *const ptr = (acc_opencl_options.svm_interop - ? acc_opencl_get_hostptr(buffer) : NULL); + ? c_dbcsr_acc_opencl_get_hostptr(buffer) : NULL); #endif ACC_OPENCL_CHECK(clReleaseMemObject(buffer), "release device memory buffer", result); @@ -219,7 +219,7 @@ int acc_dev_mem_deallocate(void* dev_mem) } -int acc_dev_mem_set_ptr(void** dev_mem, void* other, size_t lb) +int c_dbcsr_acc_dev_mem_set_ptr(void** dev_mem, void* other, size_t lb) { int result; assert(NULL != dev_mem); @@ -232,7 +232,7 @@ int acc_dev_mem_set_ptr(void** dev_mem, void* other, size_t lb) } -int acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, void* stream) +int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, void* stream) { int result = EXIT_SUCCESS; assert((NULL != host_mem || 0 == nbytes) && (NULL != dev_mem || 0 == nbytes) && NULL != stream); @@ -245,7 +245,7 @@ int acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, void* str } -int acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, void* stream) +int c_dbcsr_acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, void* stream) { int result = EXIT_SUCCESS; assert((NULL != dev_mem || 0 == nbytes) && (NULL != host_mem || 0 == nbytes) && NULL != stream); @@ -258,7 +258,7 @@ int acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, void* str } -int acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbytes, void* stream) +int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbytes, void* stream) { int result = EXIT_SUCCESS; assert((NULL != devmem_src || 0 == nbytes) && (NULL != devmem_dst || 0 == nbytes) && NULL != stream); @@ -272,7 +272,7 @@ int acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbytes, void } -int acc_memset_zero(void* dev_mem, size_t offset, size_t nbytes, void* stream) +int c_dbcsr_acc_memset_zero(void* dev_mem, size_t offset, size_t nbytes, void* stream) { int result = EXIT_SUCCESS; assert((NULL != dev_mem || 0 == nbytes) && NULL != stream); @@ -286,7 +286,7 @@ int acc_memset_zero(void* dev_mem, size_t offset, size_t nbytes, void* stream) } -int acc_opencl_info_devmem(cl_device_id device, size_t* mem_free, size_t* mem_total) +int c_dbcsr_acc_opencl_info_devmem(cl_device_id device, size_t* mem_free, size_t* mem_total) { int result = EXIT_SUCCESS; size_t size_free = 0, size_total = 0; @@ -349,15 +349,15 @@ int acc_opencl_info_devmem(cl_device_id device, size_t* mem_free, size_t* mem_to } -int acc_dev_mem_info(size_t* mem_free, size_t* mem_total) +int c_dbcsr_acc_dev_mem_info(size_t* mem_free, size_t* mem_total) { int result = EXIT_SUCCESS; cl_device_id active_id = NULL; if (NULL != acc_opencl_context) { - result = acc_opencl_device(NULL/*stream*/, &active_id); + result = c_dbcsr_acc_opencl_device(NULL/*stream*/, &active_id); } if (EXIT_SUCCESS == result) { - result = acc_opencl_info_devmem(active_id, mem_free, mem_total); + result = c_dbcsr_acc_opencl_info_devmem(active_id, mem_free, mem_total); } ACC_OPENCL_RETURN(result); } diff --git a/src/acc/opencl/acc_opencl_stream.c b/src/acc/opencl/acc_opencl_stream.c index 8403eee79bb..be8825523b0 100644 --- a/src/acc/opencl/acc_opencl_stream.c +++ b/src/acc/opencl/acc_opencl_stream.c @@ -36,14 +36,14 @@ extern "C" { #endif -int acc_opencl_stream_create(cl_command_queue* stream_p, const char* name, +int c_dbcsr_acc_opencl_stream_create(cl_command_queue* stream_p, const char* name, const ACC_OPENCL_COMMAND_QUEUE_PROPERTIES* properties) { cl_int result = EXIT_SUCCESS; assert(NULL != stream_p); if (NULL != acc_opencl_context) { cl_device_id device_id = NULL; - result = acc_opencl_device(NULL/*stream*/, &device_id); + result = c_dbcsr_acc_opencl_device(NULL/*stream*/, &device_id); if (EXIT_SUCCESS == result) { *stream_p = ACC_OPENCL_CREATE_COMMAND_QUEUE(acc_opencl_context, device_id, properties, &result); } @@ -55,7 +55,7 @@ int acc_opencl_stream_create(cl_command_queue* stream_p, const char* name, } -int acc_stream_create(void** stream_p, const char* name, int priority) +int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) { cl_int result = EXIT_SUCCESS; if (NULL != acc_opencl_context) { @@ -70,7 +70,7 @@ int acc_stream_create(void** stream_p, const char* name, int priority) }; properties[1] = (CL_QUEUE_PRIORITY_HIGH_KHR <= priority && CL_QUEUE_PRIORITY_LOW_KHR >= priority) ? priority : ((CL_QUEUE_PRIORITY_HIGH_KHR + CL_QUEUE_PRIORITY_LOW_KHR) / 2); - result = acc_opencl_stream_create(&queue, name, properties); + result = c_dbcsr_acc_opencl_stream_create(&queue, name, properties); } else #endif @@ -78,7 +78,7 @@ int acc_stream_create(void** stream_p, const char* name, int priority) ACC_OPENCL_COMMAND_QUEUE_PROPERTIES properties[] = { 0 /* terminator */ }; - result = acc_opencl_stream_create(&queue, name, properties); + result = c_dbcsr_acc_opencl_stream_create(&queue, name, properties); } assert(NULL != stream_p); if (EXIT_SUCCESS == result) { @@ -105,7 +105,7 @@ int acc_stream_create(void** stream_p, const char* name, int priority) } -int acc_stream_destroy(void* stream) +int c_dbcsr_acc_stream_destroy(void* stream) { int result = EXIT_SUCCESS; if (NULL != stream) { @@ -121,7 +121,7 @@ int acc_stream_destroy(void* stream) } -int acc_stream_priority_range(int* least, int* greatest) +int c_dbcsr_acc_stream_priority_range(int* least, int* greatest) { int result = ((NULL != least || NULL != greatest) ? EXIT_SUCCESS : EXIT_FAILURE); if (NULL != acc_opencl_context) { @@ -130,7 +130,7 @@ int acc_stream_priority_range(int* least, int* greatest) cl_platform_id platform = NULL; cl_device_id active_id = NULL; assert(0 < acc_opencl_ndevices); - if (EXIT_SUCCESS == result) result = acc_opencl_device(NULL/*stream*/, &active_id); + if (EXIT_SUCCESS == result) result = c_dbcsr_acc_opencl_device(NULL/*stream*/, &active_id); ACC_OPENCL_CHECK(clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL), "retrieve platform associated with active device", result); @@ -161,12 +161,12 @@ int acc_stream_priority_range(int* least, int* greatest) } -int acc_stream_sync(void* stream) +int c_dbcsr_acc_stream_sync(void* stream) { /* Blocks the host-thread. */ int result = EXIT_SUCCESS; assert(NULL != stream); #if defined(ACC_OPENCL_VERBOSE) && defined(_DEBUG) - fprintf(stderr, "acc_stream_sync(%p)\n", stream); + fprintf(stderr, "c_dbcsr_acc_stream_sync(%p)\n", stream); #endif ACC_OPENCL_CHECK(clFinish(*ACC_OPENCL_STREAM(stream)), "synchronize stream", result); @@ -174,12 +174,12 @@ int acc_stream_sync(void* stream) } -int acc_stream_wait_event(void* stream, void* event) +int c_dbcsr_acc_stream_wait_event(void* stream, void* event) { /* Wait for an event (device-side). */ int result = EXIT_SUCCESS; assert(NULL != stream && NULL != event); #if defined(ACC_OPENCL_VERBOSE) && defined(_DEBUG) - fprintf(stderr, "acc_stream_wait_event(%p, %p)\n", stream, event); + fprintf(stderr, "c_dbcsr_acc_stream_wait_event(%p, %p)\n", stream, event); #endif #if defined(ACC_OPENCL_STREAM_SYNCFLUSH) ACC_OPENCL_CHECK(clFlush(*ACC_OPENCL_STREAM(stream)), "flush stream", result); diff --git a/src/acc/opencl/smm/CMakeLists.txt b/src/acc/opencl/smm/CMakeLists.txt index 37539399ba7..b7f65627ee1 100644 --- a/src/acc/opencl/smm/CMakeLists.txt +++ b/src/acc/opencl/smm/CMakeLists.txt @@ -1,30 +1,18 @@ -set(LIBSMM_ACC_SRC_FILES opencl_libsmm.c) - set(LIBSMM_ACC_HEADER_KERNELS ${CMAKE_CURRENT_SOURCE_DIR}/opencl_kernels.h) -set(LIBSMM_ACC_HEADER_FILES opencl_libsmm.h ${LIBSMM_ACC_HEADER_KERNELS}) -set(LIBSMM_ACC_FILES ${LIBSMM_ACC_SRC_FILES} ${LIBSMM_ACC_HEADER_FILES}) set(SMM_ACC_KERNEL_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/../acc_opencl.sh) set(SMM_ACC_KERNELS kernels/multiply.cl kernels/transpose.cl) list(TRANSFORM SMM_ACC_KERNELS PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/) -if (USE_ACCEL MATCHES "opencl") - add_custom_command( - COMMAND - ${SMM_ACC_KERNEL_SCRIPT} ${SMM_ACC_KERNELS} - # parameter file is optional (does not need to exist) - ${CMAKE_CURRENT_SOURCE_DIR}/tune_multiply.csv ${LIBSMM_ACC_HEADER_KERNELS} - # param file is missing here (manual/clean rebuild needed) - DEPENDS ${SMM_ACC_KERNEL_SCRIPT} ${SMM_ACC_KERNELS} - OUTPUT ${LIBSMM_ACC_HEADER_KERNELS} - COMMENT "libsmm_acc: generating kernels") - add_library(libsmm_acc OBJECT ${LIBSMM_ACC_FILES}) - target_compile_definitions(libsmm_acc PRIVATE __OPENCL) - target_compile_definitions(libsmm_acc PRIVATE __LIBXSMM) - # account for DBCSR not calling libsmm_acc_init() (DBCSR only calls acc_init) - target_compile_definitions(libsmm_acc PRIVATE __DBCSR_ACC) - target_include_directories(libsmm_acc PRIVATE ${LIBXSMM_INCLUDE_DIRS}) - if (OpenMP_FOUND) - target_link_libraries(libsmm_acc PRIVATE OpenMP::OpenMP_C) - endif () -endif () +add_custom_target( + parameters ALL + COMMAND + ${SMM_ACC_KERNEL_SCRIPT} ${SMM_ACC_KERNELS} + ${CMAKE_CURRENT_SOURCE_DIR}/tune_multiply.csv ${LIBSMM_ACC_HEADER_KERNELS} + DEPENDS ${SMM_ACC_KERNEL_SCRIPT} ${SMM_ACC_KERNELS} + BYPRODUCTS ${LIBSMM_ACC_HEADER_KERNELS} + COMMENT "libsmm_acc: generating kernels") + +add_dependencies(dbcsr parameters) +target_include_directories(dbcsr PRIVATE ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/src/acc/opencl/smm/opencl_libsmm.c b/src/acc/opencl/smm/opencl_libsmm.c index e0a976adae2..f7878d1b1fe 100644 --- a/src/acc/opencl/smm/opencl_libsmm.c +++ b/src/acc/opencl/smm/opencl_libsmm.c @@ -142,7 +142,7 @@ int libsmm_acc_init(void) * However, DBCSR only calls acc_init (and expects an implicit libsmm_acc_init). */ if (EXIT_SUCCESS == result) { - result = acc_init(); + result = c_dbcsr_acc_init(); } #endif if (EXIT_SUCCESS == result) { @@ -262,7 +262,7 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, int nchar = ACC_OPENCL_SNPRINTF(fname, sizeof(fname), "xtrans%ix%i", m, n); if (0 < nchar && (int)sizeof(fname) > nchar) { cl_device_id active_device; - result = acc_opencl_device(stream, &active_device); + result = c_dbcsr_acc_opencl_device(stream, &active_device); if (EXIT_SUCCESS == result) { const char *const env_options = getenv("OPENCL_LIBSMM_TRANS_BUILDOPTS"); const char *const env_inplace = getenv("OPENCL_LIBSMM_TRANS_INPLACE"); @@ -298,7 +298,7 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, if ('\0' != *typename && 0 < nchar && (int)sizeof(build_options) > nchar) { opencl_libsmm_trans_t new_config; #if defined(OPENCL_LIBSMM_SOURCE_TRANSPOSE) - result = acc_opencl_kernel(OPENCL_LIBSMM_SOURCE_TRANSPOSE, + result = c_dbcsr_acc_opencl_kernel(OPENCL_LIBSMM_SOURCE_TRANSPOSE, build_options, fname, &new_config.kernel); #else ACC_OPENCL_UNUSED(inplace); @@ -306,7 +306,7 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, #endif if (EXIT_SUCCESS == result) { int max_wgsize; - result = acc_opencl_wgsize(active_device, new_config.kernel, + result = c_dbcsr_acc_opencl_wgsize(active_device, new_config.kernel, &max_wgsize, NULL/*preferred_multiple*/); if (EXIT_SUCCESS == result) { assert(0 < max_wgsize); @@ -345,9 +345,9 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, omat = (char*)libxsmm_aligned_scratch(data_size, 0/*auto-align*/); gold = (char*)libxsmm_aligned_scratch(mn * typesize, 0/*auto-align*/); if (NULL != imat && NULL != omat && NULL != gold) { - ACC_OPENCL_CHECK(acc_memcpy_d2h(dev_trs_stack, stack, sizeof(int) * offset_stack_size, stream), + ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_trs_stack, stack, sizeof(int) * offset_stack_size, stream), "transfer debug stack", result); - ACC_OPENCL_CHECK(acc_memcpy_d2h(dev_data, imat, data_size, stream), + ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_data, imat, data_size, stream), "transfer debug input", result); } else result = EXIT_FAILURE; @@ -371,11 +371,11 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, LIBXSMM_ATOMIC_RELEASE(lock, LIBXSMM_ATOMIC_RELAXED); } #if defined(OPENCL_LIBSMM_DEBUG_TRANS) - ACC_OPENCL_CHECK(acc_memcpy_d2h(dev_data, omat, data_size, stream), + ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_data, omat, data_size, stream), "transfer debug test", result); #endif #if defined(OPENCL_LIBSMM_DEBUG_TRANS) || defined(OPENCL_LIBSMM_SYNC) - ACC_OPENCL_CHECK(acc_stream_sync(stream), "sync stream", result); + ACC_OPENCL_CHECK(c_dbcsr_acc_stream_sync(stream), "sync stream", result); #endif #if defined(OPENCL_LIBSMM_DEBUG_TRANS) if (EXIT_SUCCESS == result) { @@ -463,7 +463,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, const char* extensions = NULL; if (0 < nchar && (int)sizeof(fname) > nchar) { cl_device_id active_device; - result = acc_opencl_device(stream, &active_device); + result = c_dbcsr_acc_opencl_device(stream, &active_device); if (EXIT_SUCCESS == result) { const char *atomic_cmpxchg = NULL, *atomic_xchg = NULL; const char *atomic_type = NULL, *typename = NULL; @@ -471,7 +471,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, switch (datatype) { case dbcsr_type_real_8: { extensions = "cl_khr_fp64 cl_khr_int64_base_atomics"; - if (EXIT_SUCCESS == acc_opencl_device_ext(active_device, &extensions, 1)) { + if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, &extensions, 1)) { atomic_cmpxchg = "atom_cmpxchg"; atomic_xchg = "atom_xchg"; atomic_type = "long"; @@ -481,7 +481,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, } break; case dbcsr_type_real_4: { extensions = "cl_khr_global_int32_base_atomics"; - if (EXIT_SUCCESS == acc_opencl_device_ext(active_device, &extensions, 1)) { + if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, &extensions, 1)) { atomic_cmpxchg = "atomic_cmpxchg"; atomic_xchg = "atomic_xchg"; atomic_type = "int"; @@ -493,7 +493,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, } if (NULL != typename) { int max_wgsize, wgsize, bs, bm, bn, nbm, nbn; - result = acc_opencl_wgsize(active_device, NULL/*device-specific*/, + result = c_dbcsr_acc_opencl_wgsize(active_device, NULL/*device-specific*/, &max_wgsize, NULL/*preferred_multiple*/); if (EXIT_SUCCESS == result) { const char *const env_batchsize = getenv("OPENCL_LIBSMM_SMM_BATCHSIZE"); @@ -528,8 +528,8 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, const char *const env_atomics = getenv("OPENCL_LIBSMM_SMM_ATOMICS"); const char *atomics = NULL; if (NULL == env_atomics || '0' != *env_atomics) { - if ((NULL == env_atomics && EXIT_SUCCESS != acc_opencl_device_vendor(active_device, "nvidia")) - || NULL != acc_opencl_stristr(env_atomics, "cmpxchg")) + if ((NULL == env_atomics && EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(active_device, "nvidia")) + || NULL != c_dbcsr_acc_opencl_stristr(env_atomics, "cmpxchg")) { atomics = "atomic_add_global_cmpxchg(A,B)"; } @@ -544,7 +544,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, nchar = ACC_OPENCL_SNPRINTF(build_options, sizeof(build_options), "%s -cl-fast-relaxed-math -cl-no-signed-zeros -cl-denorms-are-zero" " -DGLOBAL=%s -DFN=%s -DSM=%i -DSN=%i -DSK=%i -DBM=%i -DBN=%i -DBS=%i" - " -DT=%s -DTA=\"%s\" -DFMA=fma -DCMPXCHG=%s -DXCHG=%s" + " -DT=%s -DTA=%s -DFMA=fma -DCMPXCHG=%s -DXCHG=%s" " -D\"ATOMIC_ADD_GLOBAL(A,B)=%s\"", (NULL == env_options || '\0' == *env_options) ? "" : env_options, EXIT_SUCCESS != opencl_libsmm_use_cmem(active_device) ? "global" : "constant", @@ -560,13 +560,13 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, if (EXIT_SUCCESS == result) { opencl_libsmm_smm_t new_config; #if defined(OPENCL_LIBSMM_SOURCE_MULTIPLY) - result = acc_opencl_kernel(OPENCL_LIBSMM_SOURCE_MULTIPLY, + result = c_dbcsr_acc_opencl_kernel(OPENCL_LIBSMM_SOURCE_MULTIPLY, build_options, fname, &new_config.kernel); #else result = EXIT_FAILURE; #endif if (EXIT_SUCCESS == result) { - result = acc_opencl_wgsize(active_device, new_config.kernel, + result = c_dbcsr_acc_opencl_wgsize(active_device, new_config.kernel, &max_wgsize, NULL/*preferred_multiple*/); if (EXIT_SUCCESS == result) { assert(0 < wgsize && 0 < max_wgsize); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c1c47840e95..1d32a079811 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -15,16 +15,11 @@ endif () # =================================== DBCSR PERF TESTS set(DBCSR_PERF_SRCS dbcsr_performance_driver.F dbcsr_performance_multiply.F) -if (USE_ACCEL MATCHES "hip") - hip_add_executable(dbcsr_perf ${DBCSR_PERF_SRCS}) -else () - add_executable(dbcsr_perf ${DBCSR_PERF_SRCS}) -endif () +add_executable(dbcsr_perf ${DBCSR_PERF_SRCS}) target_link_libraries(dbcsr_perf dbcsr) set_target_properties(dbcsr_perf PROPERTIES LINKER_LANGUAGE Fortran) -if (OpenMP_FOUND) - target_link_libraries(dbcsr_perf OpenMP::OpenMP_Fortran) -endif () +target_link_libraries(dbcsr_perf + $<$:OpenMP::OpenMP_Fortran>) file( GLOB DBCSR_PERF_TESTS @@ -66,11 +61,8 @@ set(DBCSR_TESTS_FTN dbcsr_tas_unittest dbcsr_test_csr_conversions) -if (NOT USE_ACCEL STREQUAL "") - # TODO: enable dbcsr_acc_test for HIP - if (NOT USE_ACCEL MATCHES "hip") - set(DBCSR_TESTS_C dbcsr_acc_test_c) - endif () +if (USE_ACCEL) + set(DBCSR_TESTS_C dbcsr_acc_test_c) endif () if (NOT (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")) @@ -114,11 +106,7 @@ target_link_libraries(dbcsr_unittest_common PUBLIC dbcsr) # Compile Fortran tests foreach (dbcsr_test ${DBCSR_TESTS_FTN}) - if (USE_ACCEL MATCHES "hip") - hip_add_executable(${dbcsr_test} ${${dbcsr_test}_SRCS}) - else () - add_executable(${dbcsr_test} ${${dbcsr_test}_SRCS}) - endif () + add_executable(${dbcsr_test} ${${dbcsr_test}_SRCS}) target_link_libraries(${dbcsr_test} dbcsr_unittest_common) set_target_properties(${dbcsr_test} PROPERTIES LINKER_LANGUAGE Fortran) # register unittest executable with CMake @@ -172,12 +160,8 @@ if (WITH_C_API) endforeach () foreach (dbcsr_test_c ${DBCSR_TESTS_C}) - if (USE_ACCEL MATCHES "hip") - hip_add_executable(${dbcsr_test_c} ${${dbcsr_test_c}_SRCS}) - else () - add_executable(${dbcsr_test_c} ${${dbcsr_test_c}_SRCS}) - endif () - target_link_libraries(${dbcsr_test_c} dbcsr_c) + add_executable(${dbcsr_test_c} ${${dbcsr_test_c}_SRCS}) + target_link_libraries(${dbcsr_test_c} dbcsr_c dbcsr) # register unittest executable with CMake if (USE_MPI) separate_arguments(MPIEXEC_PREFLAGS) @@ -204,8 +188,8 @@ file(RELATIVE_PATH CURRENT_BINARY_DIR_RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}) # libsmm_acc_unittest_multiply -add_custom_command( - OUTPUT libsmm_acc_unittest_multiply.cpp +add_custom_target( + generate_libsmm_acc_unittest_multiply_test_cpp COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_libsmm_acc_unittest_multiply.py @@ -213,13 +197,12 @@ add_custom_command( ${CURRENT_BINARY_DIR_RELATIVE} --gpu_version=${WITH_GPU} DEPENDS libsmm_acc_unittest_multiply.cpp.template generate_libsmm_acc_unittest_multiply.py + BYPRODUCTS libsmm_acc_unittest_multiply.cpp COMMENT "Generate tests/libsmm_acc_unittest_multiply.cpp") -add_custom_target(generate_libsmm_acc_unittest_multiply_test_cpp - DEPENDS libsmm_acc_unittest_multiply.cpp) # libsmm_acc_timer_multiply -add_custom_command( - OUTPUT libsmm_acc_timer_multiply.cpp +add_custom_target( + generate_libsmm_acc_timer_multiply_test_cpp COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_libsmm_acc_timer_multiply.py --base_dir @@ -227,9 +210,8 @@ add_custom_command( --gpu_version=${WITH_GPU} DEPENDS libsmm_acc_timer_multiply.cpp.template generate_libsmm_acc_timer_multiply.py - COMMENT "Generate tests/libsmm_acc_unittest_transpose.cpp") -add_custom_target(generate_libsmm_acc_timer_multiply_test_cpp - DEPENDS libsmm_acc_timer_multiply.cpp) + BYPRODUCTS libsmm_acc_timer_multiply.cpp + COMMENT "Generate tests/generate_libsmm_acc_timer_multiply.cpp") if (USE_ACCEL MATCHES "cuda|hip") @@ -239,73 +221,21 @@ if (USE_ACCEL MATCHES "cuda|hip") ${CMAKE_CURRENT_BINARY_DIR}/libsmm_acc_timer_multiply.cpp libsmm_acc_unittest_transpose.cpp) - # Tests that need no additional arguments to be run - set(LIBSMM_ACC_NOARG_TESTS libsmm_acc_unittest_multiply - libsmm_acc_unittest_transpose) - # Add executables for all libsmm_acc tests - if (USE_ACCEL MATCHES "cuda") - - foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_SRCS}) - - get_filename_component(libsmm_acc_test_name ${libsmm_acc_test} NAME_WE) - - add_executable(${libsmm_acc_test_name} ${libsmm_acc_test}) - target_compile_definitions(${libsmm_acc_test_name} PRIVATE __CUDA) - target_include_directories( - ${libsmm_acc_test_name} - PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - - target_link_libraries(${libsmm_acc_test_name} dbcsr) - - if (OpenMP_FOUND) - target_link_libraries(${libsmm_acc_test_name} OpenMP::OpenMP_CXX) - endif () - - endforeach () - - elseif (USE_ACCEL MATCHES "hip") - - foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_SRCS}) - set_source_files_properties(${libsmm_acc_test} - PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - - get_filename_component(libsmm_acc_test_name ${libsmm_acc_test} NAME_WE) - - hip_add_executable(${libsmm_acc_test_name} ${libsmm_acc_test}) - target_link_options(${libsmm_acc_test_name} PRIVATE ${HIP_ARCH_FLAGS}) - - target_link_libraries(${libsmm_acc_test_name} dbcsr) - target_compile_definitions(${libsmm_acc_test_name} PRIVATE __HIP) - - endforeach () - - # Workaround issue in hip_add_library: explicitely write dependency between - # the test executable and the generated test c++ source file - add_dependencies(libsmm_acc_unittest_multiply - generate_libsmm_acc_unittest_multiply_test_cpp) - add_dependencies(libsmm_acc_timer_multiply - generate_libsmm_acc_timer_multiply_test_cpp) - - foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_SRCS}) - - get_filename_component(libsmm_acc_test_name ${libsmm_acc_test} NAME_WE) - target_compile_definitions(${libsmm_acc_test_name} PRIVATE __HIP) - target_include_directories(${libsmm_acc_test_name} - PRIVATE ${HIP_PATH}/../include) - - target_link_libraries(${libsmm_acc_test_name} dbcsr) - - endforeach () - - endif () - - # Add tests that do not need additional arguments - foreach (libsmm_acc_test ${LIBSMM_ACC_NOARG_TESTS}) - add_test(NAME ${libsmm_acc_test} COMMAND ${libsmm_acc_test}) + foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_SRCS}) + get_filename_component(libsmm_acc_test_name ${libsmm_acc_test} NAME_WE) + add_executable(${libsmm_acc_test_name} ${libsmm_acc_test}) + target_compile_definitions( + ${libsmm_acc_test_name} PRIVATE $<$:__HIP> + $<$:__CUDA>) + target_link_libraries(${libsmm_acc_test_name} dbcsr + $<$:hip::host> libsmm_acc) endforeach () - # Add tests needing additional arguments: + add_test(NAME libsmm_acc_unittest_multiply + COMMAND libsmm_acc_unittest_multiply) + add_test(NAME libsmm_acc_unittest_transpose + COMMAND libsmm_acc_unittest_transpose) add_test(NAME libsmm_acc_timer_multiply-autotuned COMMAND libsmm_acc_timer_multiply autotuned) add_test(NAME libsmm_acc_timer_multiply-predicted diff --git a/tests/dbcsr_acc_test.c b/tests/dbcsr_acc_test.c index 81eb8472cb9..a0cd1c3e327 100644 --- a/tests/dbcsr_acc_test.c +++ b/tests/dbcsr_acc_test.c @@ -6,7 +6,7 @@ * For further information please visit https://dbcsr.cp2k.org * * SPDX-License-Identifier: GPL-2.0+ * *------------------------------------------------------------------------------------------------*/ -#include "../src/acc/acc.h" +#include "acc/acc.h" #include #include #include @@ -75,20 +75,20 @@ int main(int argc, char* argv[]) randnums[i] = rand(); } - ACC_CHECK(acc_init()); - ACC_CHECK(acc_get_ndevices(&ndevices)); + ACC_CHECK(c_dbcsr_acc_init()); + ACC_CHECK(c_dbcsr_acc_get_ndevices(&ndevices)); PRINTF("ndevices: %i\n", ndevices); /* continue tests even with no device */ if (0 <= device && device < ndevices) { /* not an error */ - ACC_CHECK(acc_set_active_device(device)); + ACC_CHECK(c_dbcsr_acc_set_active_device(device)); } - ACC_CHECK(acc_dev_mem_info(&mem_free, &mem_total)); + ACC_CHECK(c_dbcsr_acc_dev_mem_info(&mem_free, &mem_total)); ACC_CHECK(mem_free <= mem_total ? EXIT_SUCCESS : EXIT_FAILURE); PRINTF("device memory: free=%i MB total=%i MB\n", (int)(mem_free >> 20), (int)(mem_total >> 20)); - ACC_CHECK(acc_stream_priority_range(&priomin, &priomax)); + ACC_CHECK(c_dbcsr_acc_stream_priority_range(&priomin, &priomax)); priospan = 1 + priomin - priomax; PRINTF("stream priority: lowest=%i highest=%i%s\n", priomin, priomax, 0 < priospan ? "" : " <-- WARNING: inconsistent values"); @@ -102,13 +102,13 @@ int main(int argc, char* argv[]) } /* create stream with NULL-name and low priority */ - ACC_CHECK(acc_stream_create(&s, NULL/*name*/, priomin)); - ACC_CHECK(acc_stream_destroy(s)); + ACC_CHECK(c_dbcsr_acc_stream_create(&s, NULL/*name*/, priomin)); + ACC_CHECK(c_dbcsr_acc_stream_destroy(s)); /* create stream with empty name and medium priority */ - ACC_CHECK(acc_stream_create(&s, "", (priomin + priomax) / 2)); - ACC_CHECK(acc_stream_destroy(s)); + ACC_CHECK(c_dbcsr_acc_stream_create(&s, "", (priomin + priomax) / 2)); + ACC_CHECK(c_dbcsr_acc_stream_destroy(s)); /* destroying NULL-stream shall be valid (just like delete/free) */ - ACC_CHECK(acc_stream_destroy(NULL)); + ACC_CHECK(c_dbcsr_acc_stream_destroy(NULL)); #if defined(_OPENMP) # pragma omp parallel for num_threads(nthreads) private(i) @@ -118,10 +118,10 @@ int main(int argc, char* argv[]) char name[ACC_STRING_MAXLEN]; /* thread-local */ const int n = sprintf(name, "%i", i); ACC_CHECK((0 <= n && n < ACC_STRING_MAXLEN) ? EXIT_SUCCESS : EXIT_FAILURE); - ACC_CHECK(acc_stream_create(stream + i, name, priority[i])); + ACC_CHECK(c_dbcsr_acc_stream_create(stream + i, name, priority[i])); if (ACC_STREAM_MAXNTH_DESTROY * r < ACC_STREAM_MAXCOUNT) { void *const si = stream[i]; stream[i] = NULL; - ACC_CHECK(acc_stream_destroy(si)); + ACC_CHECK(c_dbcsr_acc_stream_destroy(si)); } } @@ -133,21 +133,21 @@ int main(int argc, char* argv[]) char name[ACC_STRING_MAXLEN]; /* thread-local */ const int n = sprintf(name, "%i", i); ACC_CHECK((0 <= n && n < ACC_STRING_MAXLEN) ? EXIT_SUCCESS : EXIT_FAILURE); - ACC_CHECK(acc_stream_create(stream + i, name, priority[i])); + ACC_CHECK(c_dbcsr_acc_stream_create(stream + i, name, priority[i])); } - ACC_CHECK(acc_stream_destroy(stream[i])); + ACC_CHECK(c_dbcsr_acc_stream_destroy(stream[i])); } - ACC_CHECK(acc_event_destroy(NULL)); + ACC_CHECK(c_dbcsr_acc_event_destroy(NULL)); #if defined(_OPENMP) # pragma omp parallel for num_threads(nthreads) private(i) #endif for (i = 0; i < ACC_EVENT_MAXCOUNT; ++i) { const int r = randnums[i] % ACC_EVENT_MAXCOUNT; - ACC_CHECK(acc_event_create(event + i)); + ACC_CHECK(c_dbcsr_acc_event_create(event + i)); if (ACC_EVENT_MAXNTH_DESTROY * r < ACC_EVENT_MAXCOUNT) { void *const ei = event[i]; event[i] = NULL; - ACC_CHECK(acc_event_destroy(ei)); + ACC_CHECK(c_dbcsr_acc_event_destroy(ei)); } } @@ -156,25 +156,25 @@ int main(int argc, char* argv[]) #endif for (i = 0; i < ACC_EVENT_MAXCOUNT; ++i) { if (NULL == event[i]) { - ACC_CHECK(acc_event_create(event + i)); + ACC_CHECK(c_dbcsr_acc_event_create(event + i)); } - ACC_CHECK(acc_event_destroy(event[i])); + ACC_CHECK(c_dbcsr_acc_event_destroy(event[i])); } #if defined(_OPENMP) # pragma omp parallel for num_threads(nthreads) private(i) #endif - for (i = 0; i < ACC_EVENT_MAXCOUNT; ++i) ACC_CHECK(acc_event_create(event + i)); + for (i = 0; i < ACC_EVENT_MAXCOUNT; ++i) ACC_CHECK(c_dbcsr_acc_event_create(event + i)); for (i = 0; i < ACC_EVENT_MAXCOUNT; ++i) { acc_bool_t has_occurred = 0; - ACC_CHECK(acc_event_query(event[i], &has_occurred)); + ACC_CHECK(c_dbcsr_acc_event_query(event[i], &has_occurred)); ACC_CHECK(has_occurred ? EXIT_SUCCESS : EXIT_FAILURE); } - ACC_CHECK(acc_stream_create(&s, "stream", priomax)); - ACC_CHECK(acc_host_mem_allocate(&host_mem, mem_alloc, s)); - ACC_CHECK(acc_dev_mem_allocate(&dev_mem, mem_alloc)); - ACC_CHECK(acc_stream_sync(s)); /* wait for completion */ + ACC_CHECK(c_dbcsr_acc_stream_create(&s, "stream", priomax)); + ACC_CHECK(c_dbcsr_acc_host_mem_allocate(&host_mem, mem_alloc, s)); + ACC_CHECK(c_dbcsr_acc_dev_mem_allocate(&dev_mem, mem_alloc)); + ACC_CHECK(c_dbcsr_acc_stream_sync(s)); /* wait for completion */ memset(host_mem, 0xFF, mem_alloc); /* non-zero pattern */ nt = (nthreads < ACC_EVENT_MAXCOUNT ? nthreads : ACC_EVENT_MAXCOUNT); @@ -191,16 +191,16 @@ int main(int argc, char* argv[]) const size_t offset = tid * mem_chunk, mem_rest = mem_alloc - offset; const size_t size = (mem_chunk <= mem_rest ? mem_chunk : mem_rest); acc_bool_t has_occurred = 0; - ACC_CHECK(acc_memset_zero(dev_mem, offset, size, s)); + ACC_CHECK(c_dbcsr_acc_memset_zero(dev_mem, offset, size, s)); /* can enqueue multiple/duplicate copies for the same memory region */ - ACC_CHECK(acc_memcpy_d2h(dev_mem, host_mem, mem_alloc, s)); - ACC_CHECK(acc_event_query(event[tid], &has_occurred)); + ACC_CHECK(c_dbcsr_acc_memcpy_d2h(dev_mem, host_mem, mem_alloc, s)); + ACC_CHECK(c_dbcsr_acc_event_query(event[tid], &has_occurred)); /* unrecorded event has no work to wait for, hence it occurred */ ACC_CHECK(has_occurred ? EXIT_SUCCESS : EXIT_FAILURE); - ACC_CHECK(acc_event_record(event[tid], s)); - ACC_CHECK(acc_stream_wait_event(s, event[tid])); - ACC_CHECK(acc_event_synchronize(event[tid])); - ACC_CHECK(acc_event_query(event[tid], &has_occurred)); + ACC_CHECK(c_dbcsr_acc_event_record(event[tid], s)); + ACC_CHECK(c_dbcsr_acc_stream_wait_event(s, event[tid])); + ACC_CHECK(c_dbcsr_acc_event_synchronize(event[tid])); + ACC_CHECK(c_dbcsr_acc_event_query(event[tid], &has_occurred)); ACC_CHECK(has_occurred ? EXIT_SUCCESS : EXIT_FAILURE); } @@ -208,17 +208,17 @@ int main(int argc, char* argv[]) for (i = (int)(mem_alloc - 1); 0 <= i; --i) { ACC_CHECK(0 == ((char*)host_mem)[i] ? EXIT_SUCCESS : EXIT_FAILURE); } - ACC_CHECK(acc_dev_mem_deallocate(dev_mem)); - ACC_CHECK(acc_host_mem_deallocate(host_mem, s)); - ACC_CHECK(acc_stream_destroy(s)); + ACC_CHECK(c_dbcsr_acc_dev_mem_deallocate(dev_mem)); + ACC_CHECK(c_dbcsr_acc_host_mem_deallocate(host_mem, s)); + ACC_CHECK(c_dbcsr_acc_stream_destroy(s)); #if defined(_OPENMP) # pragma omp parallel for num_threads(nthreads) private(i) #endif - for (i = 0; i < ACC_EVENT_MAXCOUNT; ++i) ACC_CHECK(acc_event_destroy(event[i])); + for (i = 0; i < ACC_EVENT_MAXCOUNT; ++i) ACC_CHECK(c_dbcsr_acc_event_destroy(event[i])); - acc_clear_errors(); /* no result code */ - ACC_CHECK(acc_finalize()); + c_dbcsr_acc_clear_errors(); /* no result code */ + ACC_CHECK(c_dbcsr_acc_finalize()); return EXIT_SUCCESS; } diff --git a/tests/libsmm_acc_timer_multiply.cpp.template b/tests/libsmm_acc_timer_multiply.cpp.template index 27fdc8b727d..3089c9d1432 100644 --- a/tests/libsmm_acc_timer_multiply.cpp.template +++ b/tests/libsmm_acc_timer_multiply.cpp.template @@ -12,9 +12,8 @@ #include #include #include -#include "../src/acc/libsmm_acc/libsmm_acc_benchmark.h" -#include "../src/acc/libsmm_acc/libsmm_acc.h" -#include "../src/acc/libsmm_acc/parameters.h" +#include "libsmm_acc_benchmark.h" +#include "libsmm_acc.h" std::vector combinations(std::vector to_combine){ diff --git a/tests/libsmm_acc_unittest_multiply.cpp.template b/tests/libsmm_acc_unittest_multiply.cpp.template index 98475108435..8d97e0ff857 100644 --- a/tests/libsmm_acc_unittest_multiply.cpp.template +++ b/tests/libsmm_acc_unittest_multiply.cpp.template @@ -11,9 +11,8 @@ #include #include #include -#include "../src/acc/libsmm_acc/libsmm_acc_benchmark.h" -#include "../src/acc/libsmm_acc/libsmm_acc.h" -#include "../src/acc/libsmm_acc/parameters.h" +#include "libsmm_acc_benchmark.h" +#include "libsmm_acc.h" /****************************************************************************\ diff --git a/tests/libsmm_acc_unittest_transpose.cpp b/tests/libsmm_acc_unittest_transpose.cpp index f4518aa8601..b0444443407 100644 --- a/tests/libsmm_acc_unittest_transpose.cpp +++ b/tests/libsmm_acc_unittest_transpose.cpp @@ -13,9 +13,9 @@ #include #include #include -#include "../src/acc/libsmm_acc/libsmm_acc_benchmark.h" -#include "../src/acc/libsmm_acc/libsmm_acc.h" -#include "../src/acc/libsmm_acc/parameters.h" +#include "libsmm_acc_benchmark.h" +#include "libsmm_acc.h" +#include "parameters.h" /****************************************************************************\