
Commit 6732b79

use WARPSIZE

Author: Yu Shi
Parent: 8591248
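The commit replaces hardcoded 32s with the WARPSIZE macro. The macro's definition is not part of this diff; a plausible form (an assumption, not shown in the changed files) is:

#define WARPSIZE 32  // warp size on all current NVIDIA GPUs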

2 files changed: 3 additions, 3 deletions

src/objective/cuda/cuda_rank_objective.cu (1 addition, 1 deletion)

@@ -527,7 +527,7 @@ __global__ void GetGradientsKernel_RankXENDCG_GlobalMemory(
   double* cuda_params_buffer_pointer = cuda_params_buffer + item_index_start;
   const data_size_t block_reduce_size = query_item_count > 1024 ? 1024 : query_item_count;
   // assert that warpSize == 32, so we use buffer size 1024 / 32 = 32
-  __shared__ double shared_buffer[WARPSIZE];
+  __shared__ double shared_buffer[1024 / WARPSIZE];
   __shared__ double reduce_result;
   if (query_item_count <= 1) {
     for (data_size_t i = 0; i <= query_item_count; ++i) {
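Why 1024 / WARPSIZE rather than WARPSIZE: in the two-stage shuffle reduction this kernel relies on (the same pattern behind the ShuffleReduceMax call in the second file below), each warp first reduces its own values in registers, then lane 0 of every warp writes one partial result to shared memory. A 1024-thread block contains 1024 / 32 = 32 warps, so the buffer needs one slot per warp. With warpSize == 32 the two expressions happen to be equal, but 1024 / WARPSIZE states the intent. A minimal standalone sketch of the pattern, with hypothetical names rather than LightGBM's actual helpers:

#define WARPSIZE 32  // assumption: matches the device warpSize of 32

__device__ double WarpReduceSum(double val) {
  // reduce 32 values within one warp using register shuffles only
  for (int offset = WARPSIZE / 2; offset > 0; offset >>= 1) {
    val += __shfl_down_sync(0xffffffff, val, offset);
  }
  return val;
}

__global__ void BlockReduceSumKernel(const double* in, double* out) {
  // one shared slot per warp; blockDim.x assumed to be a multiple of WARPSIZE
  __shared__ double shared_buffer[1024 / WARPSIZE];
  const unsigned int warp_id = threadIdx.x / WARPSIZE;
  const unsigned int lane = threadIdx.x % WARPSIZE;
  // stage 1: each warp reduces its own 32 inputs
  double val = WarpReduceSum(in[blockIdx.x * blockDim.x + threadIdx.x]);
  if (lane == 0) {
    shared_buffer[warp_id] = val;  // one partial sum per warp
  }
  __syncthreads();
  // stage 2: the first warp reduces the per-warp partial sums
  if (warp_id == 0) {
    const unsigned int num_warps = blockDim.x / WARPSIZE;
    val = (lane < num_warps) ? shared_buffer[lane] : 0.0;
    val = WarpReduceSum(val);
    if (lane == 0) {
      out[blockIdx.x] = val;  // block result
    }
  }
}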

src/treelearner/cuda/cuda_single_gpu_tree_learner.cu (2 additions, 2 deletions)

@@ -180,7 +180,7 @@ __global__ void CalcBitsetLenKernel(const CUDASplitInfo* best_split_info, size_t
   size_t len = 0;
   if (i < best_split_info->num_cat_threshold) {
     const T val = vals[i];
-    len = (val / 32) + 1;
+    len = (val / WARPSIZE) + 1;
   }
   const size_t block_max_len = ShuffleReduceMax<size_t>(len, shared_mem_buffer, blockDim.x);
   if (threadIdx.x == 0) {

@@ -212,7 +212,7 @@ __global__ void CUDAConstructBitsetKernel(const CUDASplitInfo* best_split_info,
   if (i < best_split_info->num_cat_threshold) {
     const T val = vals[i];
     // can use add instead of or here, because each bit will only be added once
-    atomicAdd_system(out + (val / 32), (0x1 << (val % 32)));
+    atomicAdd_system(out + (val / WARPSIZE), (0x1 << (val % WARPSIZE)));
   }
 }
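The arithmetic in both hunks is a standard packed bitset: value val occupies bit val % 32 of 32-bit word val / 32, so representing val requires (val / 32) + 1 words, and atomicAdd is interchangeable with atomicOr because each category value, hence each bit, occurs exactly once. Note the 32 here is the bit width of a uint32_t word, which WARPSIZE equals only because warpSize is also 32. A standalone sketch with hypothetical names, using plain atomicAdd in place of the system-scoped atomicAdd_system above:

#include <cstdint>

#define WARPSIZE 32  // assumption: 32, matching both warpSize and the word width

__global__ void ConstructBitsetKernel(const int32_t* vals, int num_vals,
                                      uint32_t* out) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num_vals) {
    const int32_t val = vals[i];
    // safe to add instead of or: each value, hence each bit, appears once
    atomicAdd(out + (val / WARPSIZE), 1u << (val % WARPSIZE));
  }
}

For example, val = 70 sets bit 70 % 32 = 6 of word 70 / 32 = 2, and a bitset holding it needs (70 / 32) + 1 = 3 words.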
