7 files changed (+39 / -11 lines)
File 1 of 7:

@@ -267,6 +267,34 @@ if(USE_CUDA)
   endif()
 endif()
 
+if(USE_ROCM)
+  find_package(HIP)
+  include_directories(${HIP_INCLUDE_DIRS})
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_AMD__")
+  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${OpenMP_CXX_FLAGS} -fPIC -Wall")
+
+  # avoid warning: unused variable 'mask' due to __shfl_down_sync work-around
+  set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-unused-variable")
+  # avoid warning: 'hipHostAlloc' is deprecated: use hipHostMalloc instead
+  set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-deprecated-declarations")
+  # avoid many warnings about missing overrides
+  set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-inconsistent-missing-override")
+  # avoid warning: shift count >= width of type in feature_histogram.hpp
+  set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-shift-count-overflow")
+
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${DISABLED_WARNINGS}")
+  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${DISABLED_WARNINGS}")
+
+  if(USE_DEBUG)
+    set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -g -O0")
+  else()
+    set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -O3")
+  endif()
+  message(STATUS "CMAKE_HIP_FLAGS: ${CMAKE_HIP_FLAGS}")
+
+  add_definitions(-DUSE_ROCM)
+endif()
+
 include(CheckCXXSourceCompiles)
 check_cxx_source_compiles("
 #include <xmmintrin.h>
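The add_definitions(-DUSE_ROCM) line above is what the device and host sources key off. As a hypothetical illustration only (the project's real include wiring is not shown in this diff), a translation unit could switch runtimes on that define like so:

// Hypothetical sketch, assuming the USE_ROCM definition added by the CMake block above.
#ifdef USE_ROCM
#include <hip/hip_runtime.h>   // __HIP_PLATFORM_AMD__ arrives via the CMAKE_CXX_FLAGS set above
#else
#include <cuda_runtime.h>
#endif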
File 2 of 7:

@@ -409,7 +409,7 @@ __global__ void GetGradientsKernel_RankXENDCG_SharedMemory(
   const data_size_t block_reduce_size = query_item_count >= 1024 ? 1024 : query_item_count;
   __shared__ double shared_rho[SHARED_MEMORY_SIZE];
   // assert that warpSize == 32
-  __shared__ double shared_buffer[32];
+  __shared__ double shared_buffer[WARPSIZE];
   __shared__ double shared_params[SHARED_MEMORY_SIZE];
   __shared__ score_t shared_lambdas[SHARED_MEMORY_SIZE];
   __shared__ double reduce_result;

@@ -527,7 +527,7 @@ __global__ void GetGradientsKernel_RankXENDCG_GlobalMemory(
   double* cuda_params_buffer_pointer = cuda_params_buffer + item_index_start;
   const data_size_t block_reduce_size = query_item_count > 1024 ? 1024 : query_item_count;
   // assert that warpSize == 32, so we use buffer size 1024 / 32 = 32
-  __shared__ double shared_buffer[32];
+  __shared__ double shared_buffer[WARPSIZE];
   __shared__ double reduce_result;
   if (query_item_count <= 1) {
     for (data_size_t i = 0; i <= query_item_count; ++i) {
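This and all of the following hunks replace hard-coded shared-memory sizes of 32 with WARPSIZE, whose definition is not part of this diff. A plausible definition, stated here only as an assumption, would follow the platform's warp/wavefront width (32 threads on NVIDIA, 64 on most ROCm-supported AMD GPUs):

// Hypothetical definition; the real macro lives in a header not shown in this diff.
#ifdef USE_ROCM
#define WARPSIZE 64  // AMD wavefront width on most ROCm-supported GPUs
#else
#define WARPSIZE 32  // NVIDIA warp width
#endif

Sizing the per-warp reduction buffers with this constant keeps them large enough on both platforms.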
File 3 of 7:

@@ -364,9 +364,9 @@ __device__ void FindBestSplitsDiscretizedForLeafKernelInner(
     }
   }
   __shared__ uint32_t best_thread_index;
-  __shared__ double shared_double_buffer[32];
-  __shared__ bool shared_bool_buffer[32];
-  __shared__ uint32_t shared_int_buffer[64];
+  __shared__ double shared_double_buffer[WARPSIZE];
+  __shared__ bool shared_bool_buffer[WARPSIZE];
+  __shared__ uint32_t shared_int_buffer[2 * WARPSIZE];  // need 2 * WARPSIZE since the actual ACC_HIST_TYPE could be long int
   const unsigned int threadIdx_x = threadIdx.x;
   const bool skip_sum = REVERSE ?
     (task->skip_default_bin && (task->num_bin - 1 - threadIdx_x) == static_cast<int>(task->default_bin)) :
File 4 of 7:

@@ -1080,7 +1080,7 @@ __global__ void RenewDiscretizedTreeLeavesKernel(
     double* leaf_grad_stat_buffer,
     double* leaf_hess_stat_buffer,
     double* leaf_values) {
-  __shared__ double shared_mem_buffer[32];
+  __shared__ double shared_mem_buffer[WARPSIZE];
   const int leaf_index = static_cast<int>(blockIdx.x);
   const data_size_t* data_indices_in_leaf = data_indices + leaf_data_start[leaf_index];
   const data_size_t num_data_in_leaf = leaf_num_data[leaf_index];
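For context on why one slot per warp is enough: these kernels use a two-stage block reduction in which each warp first reduces its values with register shuffles, lane 0 of every warp writes one partial result into shared memory, and the first warp then reduces those partials. Below is a minimal generic sketch of the pattern, not the project's actual reduction helper, and assuming blockDim.x is a multiple of warpSize:

// Sketch of a warp-shuffle block sum; valid result in thread 0 only.
__device__ double BlockReduceSumSketch(double val, double* shared_buffer) {
  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int warp_id = threadIdx.x / warpSize;
  const unsigned int num_warps = blockDim.x / warpSize;
  // Stage 1: reduce within each warp via shuffles.
  for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
    val += __shfl_down_sync(0xffffffffu, val, offset);
  }
  // One shared-memory slot per warp: at most blockDim.x / warpSize <= WARPSIZE slots
  // for a 1024-thread block, which is why the buffers in these hunks are sized WARPSIZE.
  if (lane == 0) {
    shared_buffer[warp_id] = val;
  }
  __syncthreads();
  // Stage 2: the first warp reduces the per-warp partials.
  val = (warp_id == 0 && lane < num_warps) ? shared_buffer[lane] : 0.0;
  if (warp_id == 0) {
    for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
      val += __shfl_down_sync(0xffffffffu, val, offset);
    }
  }
  return val;
}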
File 5 of 7:

@@ -22,7 +22,7 @@ __global__ void ReduceMinMaxKernel(
     score_t* grad_max_block_buffer,
     score_t* hess_min_block_buffer,
     score_t* hess_max_block_buffer) {
-  __shared__ score_t shared_mem_buffer[32];
+  __shared__ score_t shared_mem_buffer[WARPSIZE];
   const data_size_t index = static_cast<data_size_t>(threadIdx.x + blockIdx.x * blockDim.x);
   score_t grad_max_val = kMinScore;
   score_t grad_min_val = kMaxScore;

@@ -56,7 +56,7 @@ __global__ void ReduceBlockMinMaxKernel(
     score_t* grad_max_block_buffer,
     score_t* hess_min_block_buffer,
     score_t* hess_max_block_buffer) {
-  __shared__ score_t shared_mem_buffer[32];
+  __shared__ score_t shared_mem_buffer[WARPSIZE];
   score_t grad_max_val = kMinScore;
   score_t grad_min_val = kMaxScore;
   score_t hess_max_val = kMinScore;
File 6 of 7:

@@ -835,7 +835,7 @@ __global__ void FixHistogramDiscretizedKernel(
     const int* cuda_need_fix_histogram_features,
     const uint32_t* cuda_need_fix_histogram_features_num_bin_aligned,
     const CUDALeafSplitsStruct* cuda_smaller_leaf_splits) {
-  __shared__ int64_t shared_mem_buffer[32];
+  __shared__ int64_t shared_mem_buffer[WARPSIZE];
   const unsigned int blockIdx_x = blockIdx.x;
   const int feature_index = cuda_need_fix_histogram_features[blockIdx_x];
   const uint32_t num_bin_aligned = cuda_need_fix_histogram_features_num_bin_aligned[blockIdx_x];
File 7 of 7:

@@ -90,7 +90,7 @@ __global__ void CUDAInitValuesKernel3(const int16_t* cuda_gradients_and_hessians
     const score_t* grad_scale_pointer, const score_t* hess_scale_pointer) {
   const score_t grad_scale = *grad_scale_pointer;
   const score_t hess_scale = *hess_scale_pointer;
-  __shared__ int64_t shared_mem_buffer[32];
+  __shared__ int64_t shared_mem_buffer[WARPSIZE];
   const data_size_t data_index = static_cast<data_size_t>(threadIdx.x + blockIdx.x * blockDim.x);
   int64_t int_gradient = 0;
   int64_t int_hessian = 0;

@@ -121,7 +121,7 @@ __global__ void CUDAInitValuesKernel4(
     const data_size_t* cuda_data_indices_in_leaf,
     hist_t* cuda_hist_in_leaf,
     CUDALeafSplitsStruct* cuda_struct) {
-  __shared__ double shared_mem_buffer[32];
+  __shared__ double shared_mem_buffer[WARPSIZE];
   double thread_sum_of_gradients = 0.0f;
   double thread_sum_of_hessians = 0.0f;
   int64_t thread_sum_of_gradients_hessians = 0;
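A hypothetical sanity check, not part of this PR and assuming the WARPSIZE macro sketched earlier, could verify at startup that the compile-time constant matches the device's actual warp/wavefront width:

// Hypothetical check: compile-time WARPSIZE vs. the device's reported warpSize.
#include <cstdio>
#include <cstdlib>
#ifdef USE_ROCM
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

void CheckWarpSize(int device_id) {
#ifdef USE_ROCM
  hipDeviceProp_t prop;
  hipGetDeviceProperties(&prop, device_id);
#else
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device_id);
#endif
  if (prop.warpSize != WARPSIZE) {
    std::fprintf(stderr, "WARPSIZE (%d) != device warpSize (%d)\n", WARPSIZE, prop.warpSize);
    std::abort();
  }
}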