Commit 3cd34a2

clean up CMakeLists.txt
use WARPSIZE
1 parent f4c605d commit 3cd34a2

7 files changed: +39 −11 lines changed

CMakeLists.txt
Lines changed: 28 additions & 0 deletions

@@ -267,6 +267,34 @@ if(USE_CUDA)
   endif()
 endif()
 
+if(USE_ROCM)
+  find_package(HIP)
+  include_directories(${HIP_INCLUDE_DIRS})
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_AMD__")
+  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${OpenMP_CXX_FLAGS} -fPIC -Wall")
+
+  # avoid warning: unused variable 'mask' due to __shfl_down_sync work-around
+  set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-unused-variable")
+  # avoid warning: 'hipHostAlloc' is deprecated: use hipHostMalloc instead
+  set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-deprecated-declarations")
+  # avoid many warnings about missing overrides
+  set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-inconsistent-missing-override")
+  # avoid warning: shift count >= width of type in feature_histogram.hpp
+  set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-shift-count-overflow")
+
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${DISABLED_WARNINGS}")
+  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${DISABLED_WARNINGS}")
+
+  if(USE_DEBUG)
+    set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -g -O0")
+  else()
+    set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -O3")
+  endif()
+  message(STATUS "CMAKE_HIP_FLAGS: ${CMAKE_HIP_FLAGS}")
+
+  add_definitions(-DUSE_ROCM)
+endif()
+
 include(CheckCXXSourceCompiles)
 check_cxx_source_compiles("
 #include <xmmintrin.h>
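
Note: the new block compiles HIP sources with the same warning hygiene as the CUDA path and defines USE_ROCM for every translation unit. That define is plausibly what drives the WARPSIZE constant used throughout the kernel changes below: AMD wavefronts are 64 threads wide, while NVIDIA warps are 32. The diff does not show where WARPSIZE is defined, so the following is only a sketch of one plausible definition, not the repository's actual code:

// Hypothetical sketch; the real WARPSIZE definition is not part of this diff.
#ifdef USE_ROCM
#define WARPSIZE 64  // AMD wavefronts are 64 threads wide
#else
#define WARPSIZE 32  // NVIDIA warps are 32 threads wide
#endif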

src/objective/cuda/cuda_rank_objective.cu
Lines changed: 2 additions & 2 deletions

@@ -409,7 +409,7 @@ __global__ void GetGradientsKernel_RankXENDCG_SharedMemory(
   const data_size_t block_reduce_size = query_item_count >= 1024 ? 1024 : query_item_count;
   __shared__ double shared_rho[SHARED_MEMORY_SIZE];
   // assert that warpSize == 32
-  __shared__ double shared_buffer[32];
+  __shared__ double shared_buffer[WARPSIZE];
   __shared__ double shared_params[SHARED_MEMORY_SIZE];
   __shared__ score_t shared_lambdas[SHARED_MEMORY_SIZE];
   __shared__ double reduce_result;
@@ -527,7 +527,7 @@ __global__ void GetGradientsKernel_RankXENDCG_GlobalMemory(
   double* cuda_params_buffer_pointer = cuda_params_buffer + item_index_start;
   const data_size_t block_reduce_size = query_item_count > 1024 ? 1024 : query_item_count;
   // assert that warpSize == 32, so we use buffer size 1024 / 32 = 32
-  __shared__ double shared_buffer[32];
+  __shared__ double shared_buffer[WARPSIZE];
   __shared__ double reduce_result;
   if (query_item_count <= 1) {
     for (data_size_t i = 0; i <= query_item_count; ++i) {
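
Both kernels stage a block-wide sum through shared_buffer: each warp reduces its values in registers with shuffles, lane 0 of every warp writes one partial to shared memory, and the first warp combines the partials, reading up to one slot per lane. Sizing the buffer by the warp width keeps that indexing in bounds on both 32-wide (NVIDIA) and 64-wide (AMD) hardware; the same reasoning covers the identical one-line changes in the files below. A minimal sketch of the pattern, assuming WARPSIZE as defined above (this is not LightGBM's actual reduction helper):

// Minimal block-sum sketch. Under ROCm the sync mask is reportedly dropped by
// a compatibility work-around, which is the source of the unused-variable
// warning silenced in CMakeLists.txt above.
__device__ double BlockReduceSum(double value, double* shared_buffer) {
  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int warp = threadIdx.x / warpSize;
  const unsigned int num_warps = (blockDim.x + warpSize - 1) / warpSize;
  for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
    value += __shfl_down_sync(0xffffffffu, value, offset);  // intra-warp reduce
  }
  if (lane == 0) shared_buffer[warp] = value;  // one partial per warp
  __syncthreads();
  if (warp == 0) {  // first warp combines at most num_warps partials
    value = lane < num_warps ? shared_buffer[lane] : 0.0;
    for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
      value += __shfl_down_sync(0xffffffffu, value, offset);
    }
  }
  return value;  // thread 0 of the block holds the full sum
}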

src/treelearner/cuda/cuda_best_split_finder.cu
Lines changed: 3 additions & 3 deletions

@@ -364,9 +364,9 @@ __device__ void FindBestSplitsDiscretizedForLeafKernelInner(
     }
   }
   __shared__ uint32_t best_thread_index;
-  __shared__ double shared_double_buffer[32];
-  __shared__ bool shared_bool_buffer[32];
-  __shared__ uint32_t shared_int_buffer[64];
+  __shared__ double shared_double_buffer[WARPSIZE];
+  __shared__ bool shared_bool_buffer[WARPSIZE];
+  __shared__ uint32_t shared_int_buffer[2 * WARPSIZE];  // need 2 * WARPSIZE since the actual ACC_HIST_TYPE could be long int
   const unsigned int threadIdx_x = threadIdx.x;
   const bool skip_sum = REVERSE ?
     (task->skip_default_bin && (task->num_bin - 1 - threadIdx_x) == static_cast<int>(task->default_bin)) :
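
The double and bool buffers track the warp width as elsewhere, but the integer buffer grows to 2 * WARPSIZE because, per the new comment, the accumulated histogram type ACC_HIST_TYPE may be a 64-bit integer: staged through a uint32_t-typed buffer, each 64-bit per-warp partial occupies two slots. A hypothetical illustration of that sizing arithmetic (not the upstream kernel):

// Assumes WARPSIZE as sketched earlier; names here are illustrative only.
__global__ void SizingSketch() {
  __shared__ uint32_t shared_int_buffer[2 * WARPSIZE];  // two 32-bit words per 64-bit slot
  // Reinterpreted as 64-bit values, the buffer still offers one slot per warp,
  // matching the WARPSIZE-sized double buffer declared beside it.
  int64_t* wide_slots = reinterpret_cast<int64_t*>(shared_int_buffer);
  const unsigned int warp = threadIdx.x / warpSize;
  if (threadIdx.x % warpSize == 0) {
    wide_slots[warp] = 0;  // one 64-bit partial per warp fits exactly
  }
}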

src/treelearner/cuda/cuda_data_partition.cu
Lines changed: 1 addition & 1 deletion

@@ -1080,7 +1080,7 @@ __global__ void RenewDiscretizedTreeLeavesKernel(
     double* leaf_grad_stat_buffer,
     double* leaf_hess_stat_buffer,
     double* leaf_values) {
-  __shared__ double shared_mem_buffer[32];
+  __shared__ double shared_mem_buffer[WARPSIZE];
   const int leaf_index = static_cast<int>(blockIdx.x);
   const data_size_t* data_indices_in_leaf = data_indices + leaf_data_start[leaf_index];
   const data_size_t num_data_in_leaf = leaf_num_data[leaf_index];

src/treelearner/cuda/cuda_gradient_discretizer.cu
Lines changed: 2 additions & 2 deletions

@@ -22,7 +22,7 @@ __global__ void ReduceMinMaxKernel(
     score_t* grad_max_block_buffer,
     score_t* hess_min_block_buffer,
     score_t* hess_max_block_buffer) {
-  __shared__ score_t shared_mem_buffer[32];
+  __shared__ score_t shared_mem_buffer[WARPSIZE];
   const data_size_t index = static_cast<data_size_t>(threadIdx.x + blockIdx.x * blockDim.x);
   score_t grad_max_val = kMinScore;
   score_t grad_min_val = kMaxScore;
@@ -56,7 +56,7 @@ __global__ void ReduceBlockMinMaxKernel(
     score_t* grad_max_block_buffer,
     score_t* hess_min_block_buffer,
     score_t* hess_max_block_buffer) {
-  __shared__ score_t shared_mem_buffer[32];
+  __shared__ score_t shared_mem_buffer[WARPSIZE];
   score_t grad_max_val = kMinScore;
   score_t grad_min_val = kMaxScore;
   score_t hess_max_val = kMinScore;

src/treelearner/cuda/cuda_histogram_constructor.cu
Lines changed: 1 addition & 1 deletion

@@ -835,7 +835,7 @@ __global__ void FixHistogramDiscretizedKernel(
     const int* cuda_need_fix_histogram_features,
     const uint32_t* cuda_need_fix_histogram_features_num_bin_aligned,
     const CUDALeafSplitsStruct* cuda_smaller_leaf_splits) {
-  __shared__ int64_t shared_mem_buffer[32];
+  __shared__ int64_t shared_mem_buffer[WARPSIZE];
   const unsigned int blockIdx_x = blockIdx.x;
   const int feature_index = cuda_need_fix_histogram_features[blockIdx_x];
   const uint32_t num_bin_aligned = cuda_need_fix_histogram_features_num_bin_aligned[blockIdx_x];

src/treelearner/cuda/cuda_leaf_splits.cu
Lines changed: 2 additions & 2 deletions

@@ -90,7 +90,7 @@ __global__ void CUDAInitValuesKernel3(const int16_t* cuda_gradients_and_hessians
     const score_t* grad_scale_pointer, const score_t* hess_scale_pointer) {
   const score_t grad_scale = *grad_scale_pointer;
   const score_t hess_scale = *hess_scale_pointer;
-  __shared__ int64_t shared_mem_buffer[32];
+  __shared__ int64_t shared_mem_buffer[WARPSIZE];
   const data_size_t data_index = static_cast<data_size_t>(threadIdx.x + blockIdx.x * blockDim.x);
   int64_t int_gradient = 0;
   int64_t int_hessian = 0;
@@ -121,7 +121,7 @@ __global__ void CUDAInitValuesKernel4(
     const data_size_t* cuda_data_indices_in_leaf,
     hist_t* cuda_hist_in_leaf,
     CUDALeafSplitsStruct* cuda_struct) {
-  __shared__ double shared_mem_buffer[32];
+  __shared__ double shared_mem_buffer[WARPSIZE];
   double thread_sum_of_gradients = 0.0f;
   double thread_sum_of_hessians = 0.0f;
   int64_t thread_sum_of_gradients_hessians = 0;
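
CUDAInitValuesKernel3 accumulates integer gradients and hessians (note the int64_t buffer and thread_sum_of_gradients_hessians above), which suggests the discretized-gradient trick of packing both 32-bit sums into a single 64-bit value so one reduction pass handles both. A sketch of that packing under an assumed layout (gradient in the high word, hessian in the low word; the actual layout is not shown in this diff):

// Illustrative packing helpers; the layout is an assumption for this sketch.
// Summing packed values stays correct only while the low-word (hessian) total
// cannot overflow 32 bits and carry into the gradient word.
__device__ __forceinline__ int64_t PackGradHess(int32_t grad, int32_t hess) {
  return (static_cast<int64_t>(grad) << 32) |
         static_cast<int64_t>(static_cast<uint32_t>(hess));
}

__device__ __forceinline__ void UnpackGradHess(int64_t packed, int32_t* grad, int32_t* hess) {
  *grad = static_cast<int32_t>(packed >> 32);
  *hess = static_cast<int32_t>(packed & 0xffffffffLL);
}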
