
Commit 6732b79

use WARPSIZE

Author: Yu Shi
Parent: 8591248
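The commit replaces hardcoded 32s with the WARPSIZE macro. The macro's definition is not part of this diff; a plausible form (an assumption, not shown in the changed files) is:

#define WARPSIZE 32  // warp size on all current NVIDIA GPUs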

2 files changed: 3 additions, 3 deletions

src/objective/cuda/cuda_rank_objective.cu (1 addition, 1 deletion)

@@ -527,7 +527,7 @@ __global__ void GetGradientsKernel_RankXENDCG_GlobalMemory(
   double* cuda_params_buffer_pointer = cuda_params_buffer + item_index_start;
   const data_size_t block_reduce_size = query_item_count > 1024 ? 1024 : query_item_count;
   // assert that warpSize == 32, so we use buffer size 1024 / 32 = 32
-  __shared__ double shared_buffer[WARPSIZE];
+  __shared__ double shared_buffer[1024 / WARPSIZE];
   __shared__ double reduce_result;
   if (query_item_count <= 1) {
     for (data_size_t i = 0; i <= query_item_count; ++i) {
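Why 1024 / WARPSIZE rather than WARPSIZE: in the two-stage shuffle reduction this kernel relies on (the same pattern behind the ShuffleReduceMax call in the second file below), each warp first reduces its own values in registers, then lane 0 of every warp writes one partial result to shared memory. A 1024-thread block contains 1024 / 32 = 32 warps, so the buffer needs one slot per warp. With warpSize == 32 the two expressions happen to be equal, but 1024 / WARPSIZE states the intent. A minimal standalone sketch of the pattern, with hypothetical names rather than LightGBM's actual helpers:

#define WARPSIZE 32  // assumption: matches the device warpSize of 32

__device__ double WarpReduceSum(double val) {
  // reduce 32 values within one warp using register shuffles only
  for (int offset = WARPSIZE / 2; offset > 0; offset >>= 1) {
    val += __shfl_down_sync(0xffffffff, val, offset);
  }
  return val;
}

__global__ void BlockReduceSumKernel(const double* in, double* out) {
  // one shared slot per warp; blockDim.x assumed to be a multiple of WARPSIZE
  __shared__ double shared_buffer[1024 / WARPSIZE];
  const unsigned int warp_id = threadIdx.x / WARPSIZE;
  const unsigned int lane = threadIdx.x % WARPSIZE;
  // stage 1: each warp reduces its own 32 inputs
  double val = WarpReduceSum(in[blockIdx.x * blockDim.x + threadIdx.x]);
  if (lane == 0) {
    shared_buffer[warp_id] = val;  // one partial sum per warp
  }
  __syncthreads();
  // stage 2: the first warp reduces the per-warp partial sums
  if (warp_id == 0) {
    const unsigned int num_warps = blockDim.x / WARPSIZE;
    val = (lane < num_warps) ? shared_buffer[lane] : 0.0;
    val = WarpReduceSum(val);
    if (lane == 0) {
      out[blockIdx.x] = val;  // block result
    }
  }
}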

src/treelearner/cuda/cuda_single_gpu_tree_learner.cu (2 additions, 2 deletions)

@@ -180,7 +180,7 @@ __global__ void CalcBitsetLenKernel(const CUDASplitInfo* best_split_info, size_t
   size_t len = 0;
   if (i < best_split_info->num_cat_threshold) {
     const T val = vals[i];
-    len = (val / 32) + 1;
+    len = (val / WARPSIZE) + 1;
   }
   const size_t block_max_len = ShuffleReduceMax<size_t>(len, shared_mem_buffer, blockDim.x);
   if (threadIdx.x == 0) {

@@ -212,7 +212,7 @@ __global__ void CUDAConstructBitsetKernel(const CUDASplitInfo* best_split_info,
   if (i < best_split_info->num_cat_threshold) {
     const T val = vals[i];
     // can use add instead of or here, because each bit will only be added once
-    atomicAdd_system(out + (val / 32), (0x1 << (val % 32)));
+    atomicAdd_system(out + (val / WARPSIZE), (0x1 << (val % WARPSIZE)));
   }
 }
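The arithmetic in both hunks is a standard packed bitset: value val occupies bit val % 32 of 32-bit word val / 32, so representing val requires (val / 32) + 1 words, and atomicAdd is interchangeable with atomicOr because each category value, hence each bit, occurs exactly once. Note the 32 here is the bit width of a uint32_t word, which WARPSIZE equals only because warpSize is also 32. A standalone sketch with hypothetical names, using plain atomicAdd in place of the system-scoped atomicAdd_system above:

#include <cstdint>

#define WARPSIZE 32  // assumption: 32, matching both warpSize and the word width

__global__ void ConstructBitsetKernel(const int32_t* vals, int num_vals,
                                      uint32_t* out) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num_vals) {
    const int32_t val = vals[i];
    // safe to add instead of or: each value, hence each bit, appears once
    atomicAdd(out + (val / WARPSIZE), 1u << (val % WARPSIZE));
  }
}

For example, val = 70 sets bit 70 % 32 = 6 of word 70 / 32 = 2, and a bitset holding it needs (70 / 32) + 1 = 3 words.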
