misc: make max_top_p/k_rounds an input argument instead of a template parameter (#219)

yzh119 · web-flow · commit 4b27040fe833 · 2024-04-26T17:12:37.000-07:00
max_top_p_rounds and max_top_k_rounds don't have to be template parameters.
diff --git a/include/flashinfer/sampling.cuh b/include/flashinfer/sampling.cuh
@@ -146,10 +146,11 @@ __global__ void SamplingFromProbKernel(DType* probs, DType* uniform_samples, IdT
   output[bx] = (aggregate > u) ? temp_storage.data.sampled_id : d - 1;
 }
 
-template <uint32_t MAX_TOP_K_ROUNDS, uint32_t BLOCK_THREADS, BlockScanAlgorithm ALGORITHM,
-          uint32_t VEC_SIZE, typename DType, typename IdType>
+template <uint32_t BLOCK_THREADS, BlockScanAlgorithm ALGORITHM, uint32_t VEC_SIZE, typename DType,
+          typename IdType>
 __global__ void TopKSamplingFromProbKernel(DType* probs, DType* uniform_samples, IdType* output,
-                                           bool* success, uint32_t k, uint32_t d) {
+                                           bool* success, uint32_t k, uint32_t d,
+                                           uint32_t max_top_k_rounds) {
   const uint32_t batch_size = gridDim.x;
   const uint32_t bx = blockIdx.x, tx = threadIdx.x;
 
@@ -163,7 +164,7 @@ __global__ void TopKSamplingFromProbKernel(DType* probs, DType* uniform_samples,
   DType q = DType(0);
   DType pivot = DType(0);
   IdType sampled_id;
-  for (uint32_t round = 0; round < MAX_TOP_K_ROUNDS; ++round) {
+  for (uint32_t round = 0; round < max_top_k_rounds; ++round) {
     DType u = uniform_samples[round * batch_size + bx] * (1 - q);
     aggregate = DType(0);
     for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
@@ -230,11 +231,11 @@ __global__ void TopKSamplingFromProbKernel(DType* probs, DType* uniform_samples,
 
 constexpr float eps = 1e-5;
 
-template <uint32_t MAX_TOP_P_ROUNDS, uint32_t BLOCK_THREADS, BlockScanAlgorithm ALGORITHM,
-          uint32_t VEC_SIZE, typename DType, typename IdType>
+template <uint32_t BLOCK_THREADS, BlockScanAlgorithm ALGORITHM, uint32_t VEC_SIZE, typename DType,
+          typename IdType>
 __global__ void TopPSamplingFromProbKernel(DType* probs, DType* uniform_samples, IdType* output,
                                            bool* success, IdType* row_indices, float* top_p_arr,
-                                           float top_p, uint32_t d) {
+                                           float top_p, uint32_t d, uint32_t max_top_p_rounds) {
   const uint32_t batch_size = gridDim.x;
   const uint32_t bx = blockIdx.x, tx = threadIdx.x;
 
@@ -253,7 +254,7 @@ __global__ void TopPSamplingFromProbKernel(DType* probs, DType* uniform_samples,
   DType q = DType(0);
   DType pivot = DType(0);
   IdType sampled_id;
-  for (uint32_t round = 0; round < MAX_TOP_P_ROUNDS; ++round) {
+  for (uint32_t round = 0; round < max_top_p_rounds; ++round) {
     DType u = uniform_samples[round * batch_size + bx] * (1 - q);
     aggregate = DType(0);
     for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
@@ -356,33 +357,33 @@ cudaError_t ParallelSamplingFromProb(T* probs, T* uniform_samples, IdType* outpu
   return cudaSuccess;
 }
 
-template <uint32_t MAX_TOP_K_ROUNDS, typename T, typename IdType>
+template <typename T, typename IdType>
 cudaError_t TopKSamplingFromProb(T* probs, T* uniform_samples, IdType* output, bool* success,
                                  IdType top_k, uint32_t batch_size, uint32_t d,
-                                 cudaStream_t stream = 0) {
+                                 uint32_t max_top_k_rounds, cudaStream_t stream = 0) {
   constexpr uint32_t BLOCK_THREADS = 1024;
   const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
 
   const uint32_t smem_size =
       sizeof(SamplingTempStorage<T, BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE>);
   dim3 nblks(batch_size);
   dim3 nthrs(BLOCK_THREADS);
-  void* args[] = {&probs, &uniform_samples, &output, &success, &top_k, &d};
+  void* args[] = {&probs, &uniform_samples, &output, &success, &top_k, &d, &max_top_k_rounds};
 
   DISPATCH_ALIGNED_VEC_SIZE(vec_size, VEC_SIZE, {
-    auto kernel = TopKSamplingFromProbKernel<MAX_TOP_K_ROUNDS, BLOCK_THREADS,
-                                             BLOCK_SCAN_RAKING_MEMOIZE, VEC_SIZE, T, IdType>;
+    auto kernel =
+        TopKSamplingFromProbKernel<BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE, VEC_SIZE, T, IdType>;
     FLASHINFER_CUDA_CALL(
         cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
     FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
   });
   return cudaSuccess;
 }
 
-template <uint32_t MAX_TOP_P_ROUNDS, typename T, typename IdType>
+template <typename T, typename IdType>
 cudaError_t TopPSamplingFromProb(T* probs, T* uniform_samples, IdType* output, bool* success,
                                  T top_p, uint32_t batch_size, uint32_t d,
-                                 cudaStream_t stream = 0) {
+                                 uint32_t max_top_p_rounds, cudaStream_t stream = 0) {
   constexpr uint32_t BLOCK_THREADS = 1024;
   const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
 
@@ -399,22 +400,24 @@ cudaError_t TopPSamplingFromProb(T* probs, T* uniform_samples, IdType* output, b
                   &row_indices_placeholder,
                   &top_p_arr_placeholder,
                   &top_p,
-                  &d};
+                  &d,
+                  &max_top_p_rounds};
 
   DISPATCH_ALIGNED_VEC_SIZE(vec_size, VEC_SIZE, {
-    auto kernel = TopPSamplingFromProbKernel<MAX_TOP_P_ROUNDS, BLOCK_THREADS,
-                                             BLOCK_SCAN_RAKING_MEMOIZE, VEC_SIZE, T, IdType>;
+    auto kernel =
+        TopPSamplingFromProbKernel<BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE, VEC_SIZE, T, IdType>;
     FLASHINFER_CUDA_CALL(
         cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
     FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
   });
   return cudaSuccess;
 }
 
-template <uint32_t MAX_TOP_P_ROUNDS, typename T, typename IdType>
+template <typename T, typename IdType>
 cudaError_t ParallelTopPSamplingFromProb(T* probs, T* uniform_samples, IdType* output,
                                          bool* success, IdType* row_indices, T* top_p_arr,
-                                         uint32_t batch_size, uint32_t d, cudaStream_t stream = 0) {
+                                         uint32_t batch_size, uint32_t d, uint32_t max_top_p_rounds,
+                                         cudaStream_t stream = 0) {
   constexpr uint32_t BLOCK_THREADS = 1024;
   const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
 
@@ -423,12 +426,12 @@ cudaError_t ParallelTopPSamplingFromProb(T* probs, T* uniform_samples, IdType* o
   dim3 nblks(batch_size);
   dim3 nthrs(BLOCK_THREADS);
   T top_p_placeholder = 0;
-  void* args[] = {&probs,     &uniform_samples,   &output, &success, &row_indices,
-                  &top_p_arr, &top_p_placeholder, &d};
+  void* args[] = {&probs,     &uniform_samples,   &output, &success,         &row_indices,
+                  &top_p_arr, &top_p_placeholder, &d,      &max_top_p_rounds};
 
   DISPATCH_ALIGNED_VEC_SIZE(vec_size, VEC_SIZE, {
-    auto kernel = TopPSamplingFromProbKernel<MAX_TOP_P_ROUNDS, BLOCK_THREADS,
-                                             BLOCK_SCAN_RAKING_MEMOIZE, VEC_SIZE, T, IdType>;
+    auto kernel =
+        TopPSamplingFromProbKernel<BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE, VEC_SIZE, T, IdType>;
     FLASHINFER_CUDA_CALL(
         cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
     FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
diff --git a/src/bench_sampling.cu b/src/bench_sampling.cu
@@ -96,11 +96,11 @@ void bench_top_p_sampling_with_probability(nvbench::state& state) {
 
   state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
     timer.start();
-    cudaError_t status = sampling::TopPSamplingFromProb<max_top_p_rounds, T, int32_t>(
+    cudaError_t status = sampling::TopPSamplingFromProb<T, int32_t>(
         thrust::raw_pointer_cast(probs_d.data()),
         thrust::raw_pointer_cast(uniform_samples_d.data()),
         thrust::raw_pointer_cast(output_d.data()), thrust::raw_pointer_cast(success_d.data()), p,
-        batch_size, vocab_size);
+        batch_size, vocab_size, max_top_p_rounds);
     timer.stop();
     if (status != cudaSuccess) {
       state.skip("CUDA error: " + std::string(cudaGetErrorString(status)));
@@ -141,11 +141,11 @@ void bench_top_k_sampling_with_probability(nvbench::state& state) {
 
   state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
     timer.start();
-    cudaError_t status = sampling::TopKSamplingFromProb<max_top_k_rounds, T, int32_t>(
+    cudaError_t status = sampling::TopKSamplingFromProb<T, int32_t>(
         thrust::raw_pointer_cast(probs_d.data()),
         thrust::raw_pointer_cast(uniform_samples_d.data()),
         thrust::raw_pointer_cast(output_d.data()), thrust::raw_pointer_cast(success_d.data()), k,
-        batch_size, vocab_size);
+        batch_size, vocab_size, max_top_k_rounds);
     timer.stop();
     if (status != cudaSuccess) {
       state.skip("CUDA error: " + std::string(cudaGetErrorString(status)));
diff --git a/src/test_sampling.cu b/src/test_sampling.cu
@@ -56,11 +56,11 @@ void _TestTopKSamplingFromProb(size_t batch_size, uint32_t k, size_t vocab_size)
     utils::vec_uniform_<T>(uniform_samples_h, 0, 1);
     thrust::device_vector<T> uniform_samples_d(uniform_samples_h);
 
-    auto status = sampling::TopKSamplingFromProb<max_top_p_rounds, T, IdType>(
+    auto status = sampling::TopKSamplingFromProb<T, IdType>(
         thrust::raw_pointer_cast(probs_d.data()),
         thrust::raw_pointer_cast(uniform_samples_d.data()),
         thrust::raw_pointer_cast(sampled_ids_d.data()), thrust::raw_pointer_cast(success_d.data()),
-        k, batch_size, vocab_size);
+        k, batch_size, vocab_size, max_top_p_rounds);
 
     EXPECT_EQ(status, cudaSuccess) << "TopKSamplingFromProb kernel launch failed, error message: "
                                    << cudaGetErrorString(status);
@@ -121,11 +121,11 @@ void _TestTopPSamplingFromProb(size_t batch_size, uint32_t k, size_t vocab_size)
     utils::vec_uniform_<T>(uniform_samples_h, 0, 1);
     thrust::device_vector<T> uniform_samples_d(uniform_samples_h);
 
-    auto status = sampling::TopPSamplingFromProb<max_top_p_rounds, T, IdType>(
+    auto status = sampling::TopPSamplingFromProb<T, IdType>(
         thrust::raw_pointer_cast(probs_d.data()),
         thrust::raw_pointer_cast(uniform_samples_d.data()),
         thrust::raw_pointer_cast(sampled_ids_d.data()), thrust::raw_pointer_cast(success_d.data()),
-        p, batch_size, vocab_size);
+        p, batch_size, vocab_size, max_top_p_rounds);
 
     EXPECT_EQ(status, cudaSuccess) << "TopPSamplingFromProb kernel launch failed, error message: "
                                    << cudaGetErrorString(status);
diff --git a/src/tvm_wrapper.cu b/src/tvm_wrapper.cu
@@ -48,26 +48,6 @@ using namespace flashinfer;
     LOG(FATAL) << "Unsupported data type " << dl_dtype.code; \
   }
 
-#define DISPATCH_REJECTIVE_SAMPLING_NUM_ROUNDS(num_rounds, NUM_ROUNDS, ...)         \
-  if (num_rounds == 1) {                                                            \
-    constexpr bool NUM_ROUNDS = 1;                                                  \
-    __VA_ARGS__                                                                     \
-  } else if (num_rounds == 2) {                                                     \
-    constexpr bool NUM_ROUNDS = 2;                                                  \
-    __VA_ARGS__                                                                     \
-  } else if (num_rounds == 4) {                                                     \
-    constexpr bool NUM_ROUNDS = 4;                                                  \
-    __VA_ARGS__                                                                     \
-  } else if (num_rounds == 8) {                                                     \
-    constexpr bool NUM_ROUNDS = 8;                                                  \
-    __VA_ARGS__                                                                     \
-  } else if (num_rounds == 16) {                                                    \
-    constexpr bool NUM_ROUNDS = 16;                                                 \
-    __VA_ARGS__                                                                     \
-  } else {                                                                          \
-    LOG(FATAL) << "Unsupported number of rejective sampling rounds " << num_rounds; \
-  }
-
 int _FlashInferSinglePrefillWithKVCache(DLTensor* q, DLTensor* k, DLTensor* v, DLTensor* tmp,
                                         bool causal, int64_t kv_layout, int64_t pos_encoding_mode,
                                         bool allow_fp16_qk_reduction, double rope_scale,
@@ -739,7 +719,7 @@ void _FlashInferParallelSamplingFromProb(DLTensor* probs, DLTensor* uniform_samp
 
 void _FlashInferParallelTopPSamplingFromProb(DLTensor* probs, DLTensor* uniform_samples,
                                              DLTensor* row_indices, DLTensor* top_p,
-                                             DLTensor* sampled_token_ids, int num_rounds) {
+                                             DLTensor* sampled_token_ids) {
   CHECK_EQ(probs->device.device_type, kDLCUDA) << "The device of probs must be CUDA.";
   CHECK_EQ(uniform_samples->device.device_type, kDLCUDA)
       << "The device of uniform_samples must be CUDA.";
@@ -764,28 +744,26 @@ void _FlashInferParallelTopPSamplingFromProb(DLTensor* probs, DLTensor* uniform_
   CHECK(sampled_token_ids->dtype.code == kDLInt && sampled_token_ids->dtype.bits == 32);
 
   CHECK_EQ(probs->ndim, 2);              // num_probs, vocab_size
-  CHECK_EQ(uniform_samples->ndim, 1);    // batch_size * num_rounds,
+  CHECK_EQ(uniform_samples->ndim, 2);    // num_rounds, batch_size
   CHECK_EQ(row_indices->ndim, 1);        // batch_size,
   CHECK_EQ(top_p->ndim, 1);              // num_probs,
   CHECK_EQ(sampled_token_ids->ndim, 1);  // batch_size,
   int64_t num_probs = probs->shape[0];
   int64_t vocab_size = probs->shape[1];
   int64_t batch_size = row_indices->shape[0];
-  CHECK_EQ(uniform_samples->shape[0], batch_size * num_rounds);
+  int64_t num_rounds = uniform_samples->shape[0];
+  CHECK_EQ(uniform_samples->shape[1], batch_size);
   CHECK_EQ(top_p->shape[0], num_probs);
   CHECK_EQ(sampled_token_ids->shape[0], batch_size);
 
-  DISPATCH_REJECTIVE_SAMPLING_NUM_ROUNDS(num_rounds, rej_samping_num_rounds, {
-    cudaError_t status =
-        sampling::ParallelTopPSamplingFromProb<rej_samping_num_rounds, float, int32_t>(
-            static_cast<float*>(probs->data), static_cast<float*>(uniform_samples->data),
-            static_cast<int32_t*>(sampled_token_ids->data), /*success=*/nullptr,
-            static_cast<int32_t*>(row_indices->data), static_cast<float*>(top_p->data), batch_size,
-            vocab_size);
-    if (status != cudaSuccess) {
-      LOG(FATAL) << "FlashInfer ParallelTopPSamplingFromProb error " << cudaGetErrorString(status);
-    }
-  });
+  cudaError_t status = sampling::ParallelTopPSamplingFromProb<float, int32_t>(
+      static_cast<float*>(probs->data), static_cast<float*>(uniform_samples->data),
+      static_cast<int32_t*>(sampled_token_ids->data), /*success=*/nullptr,
+      static_cast<int32_t*>(row_indices->data), static_cast<float*>(top_p->data), batch_size,
+      vocab_size, num_rounds);
+  if (status != cudaSuccess) {
+    LOG(FATAL) << "FlashInfer ParallelTopPSamplingFromProb error " << cudaGetErrorString(status);
+  }
 }
 
 TVM_REGISTER_GLOBAL("flashinfer.attention_kernel_prefill_with_paged_kv_cache")