
Commit 1092e7e

bugfix: fix cudagraph-compatible prefill/decode apis (#281)
The `indptr` array length should be an upper bound of `batch_size + 1` in CUDA graph mode.
1 parent 7def34e commit 1092e7e
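
Background: in CSR-style batching, `indptr[i]` and `indptr[i + 1]` delimit request `i`'s entries, so a batch of size n needs n + 1 entries. Under CUDA graph capture, buffer addresses and shapes are frozen at capture time, so the buffer must be sized for the largest batch the graph may ever see, and only a prefix is meaningful at replay. A minimal sketch of this invariant in plain PyTorch (not flashinfer's API; `max_batch_size` and `run_step` are illustrative names):

    import torch

    max_batch_size = 16384  # capture-time upper bound (hypothetical value)
    # Fixed-address buffer sized for the worst case; CUDA graph replay
    # launches kernels against this exact allocation every iteration.
    indptr_buf = torch.empty(max_batch_size + 1, dtype=torch.int32, device="cuda")

    def run_step(indptr_host: torch.Tensor) -> None:
        # indptr_host holds batch_size + 1 entries for the current batch.
        batch_size = indptr_host.numel() - 1
        assert batch_size <= max_batch_size
        # Only the first batch_size + 1 entries are written; the tail is
        # stale but never read by the kernels, hence CHECK_GE not CHECK_EQ.
        indptr_buf[: batch_size + 1].copy_(indptr_host, non_blocking=True)
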

File tree

4 files changed: +19 −18 lines


include/flashinfer/attention/handler.cuh

+1 −1

@@ -420,7 +420,7 @@ class BatchDecodeHandler {
    * \note (Zihao): when enable_cuda_graph is true, max_workspace_size_in_bytes will be ignored,
    * when enable_cuda_graph is false, max_batch_size will be ignored.
    */
-  BatchDecodeHandler(size_t max_workspace_size_in_bytes = 128 * 64 * 64,
+  BatchDecodeHandler(size_t max_workspace_size_in_bytes = 128 * 1024 * 1024,
                      size_t max_batch_size = 16384, bool enable_cuda_graph = false)
       : batch_size_after_partition_(0U),
         float_buffer_(nullptr),
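
Note: the old default of 128 * 64 * 64 bytes (512 KiB) looks like a typo; the new default of 128 * 1024 * 1024 bytes (128 MiB) matches the workspace buffer size the Python tests allocate.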

python/csrc/batch_decode.cu

+2 −2

@@ -222,8 +222,8 @@ std::vector<torch::Tensor> BatchDecodeWithPagedKVCachePyTorchWrapper::Forward(
   }
   CHECK_EQ(paged_kv_data.size(1), 2);
   CHECK_EQ(paged_kv_data.size(4), head_dim);
-  CHECK_EQ(paged_kv_indptr.size(0), batch_size + 1);
-  CHECK_EQ(paged_kv_last_page_len.size(0), batch_size);
+  CHECK_GE(paged_kv_indptr.size(0), batch_size + 1);
+  CHECK_GE(paged_kv_last_page_len.size(0), batch_size);
   // TODO(Zihao): support dispatching to different data types
   CHECK_EQ(paged_kv_indptr.scalar_type(), torch::kInt32);
   CHECK_EQ(paged_kv_indices.scalar_type(), torch::kInt32);
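
Relaxing CHECK_EQ to CHECK_GE lets callers pass over-allocated `paged_kv_indptr` and `paged_kv_last_page_len` buffers; the kernel reads only the first `batch_size + 1` (respectively `batch_size`) entries, which is exactly what CUDA graph replay with a varying batch size requires.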

python/csrc/batch_prefill.cu

+12 −12

@@ -83,9 +83,9 @@ std::vector<torch::Tensor> BatchPrefillWithPagedKVCachePyTorchWrapper::Forward(
     num_kv_heads = paged_kv_data.size(3);
   }
   CHECK_GQA_HEAD_DIVISIBLE(num_qo_heads, num_kv_heads);
-  CHECK_EQ(qo_indptr.size(0), batch_size + 1);
-  CHECK_EQ(paged_kv_indptr.size(0), batch_size + 1);
-  CHECK_EQ(paged_kv_last_page_len.size(0), batch_size);
+  CHECK_GE(qo_indptr.size(0), batch_size + 1);
+  CHECK_GE(paged_kv_indptr.size(0), batch_size + 1);
+  CHECK_GE(paged_kv_last_page_len.size(0), batch_size);
   CHECK_EQ(paged_kv_data.size(1), 2);
   CHECK_EQ(paged_kv_data.size(4), head_dim);
   qo_indptr = qo_indptr.to(torch::kInt32);

@@ -186,12 +186,12 @@ std::vector<torch::Tensor> BatchPrefillWithPagedKVCachePyTorchWrapper::ForwardCu
     num_kv_heads = paged_kv_data.size(3);
   }
   CHECK_GQA_HEAD_DIVISIBLE(num_qo_heads, num_kv_heads);
-  CHECK_EQ(qo_indptr.size(0), batch_size + 1);
-  CHECK_EQ(paged_kv_indptr.size(0), batch_size + 1);
-  CHECK_EQ(paged_kv_last_page_len.size(0), batch_size);
+  CHECK_GE(qo_indptr.size(0), batch_size + 1);
+  CHECK_GE(paged_kv_indptr.size(0), batch_size + 1);
+  CHECK_GE(paged_kv_last_page_len.size(0), batch_size);
   CHECK_EQ(paged_kv_data.size(1), 2);
   CHECK_EQ(paged_kv_data.size(4), head_dim);
-  CHECK_EQ(qk_indptr.size(0), batch_size + 1);
+  CHECK_GE(qk_indptr.size(0), batch_size + 1);
   qo_indptr = qo_indptr.to(torch::kInt32);
   paged_kv_indptr = paged_kv_indptr.to(torch::kInt32);
   paged_kv_indices = paged_kv_indices.to(torch::kInt32);

@@ -303,7 +303,7 @@ std::vector<torch::Tensor> BatchPrefillWithRaggedKVCachePyTorchWrapper::Forward(
   int64_t nnz_qo = q.size(0);
   int64_t num_qo_heads = q.size(1);
   int64_t head_dim = q.size(2);
-  CHECK_EQ(kv_indptr.size(0), batch_size + 1);
+  CHECK_GE(kv_indptr.size(0), batch_size + 1);
   int64_t num_kv_heads = (kv_layout_ == QKVLayout::kNHD) ? k.size(1) : k.size(0);
   CHECK_EQ(k.size(0), v.size(0));
   CHECK_EQ(k.size(1), v.size(1));

@@ -366,8 +366,8 @@ std::vector<torch::Tensor> BatchPrefillWithRaggedKVCachePyTorchWrapper::Forward(
 std::vector<torch::Tensor> BatchPrefillWithRaggedKVCachePyTorchWrapper::ForwardCustomMask(
     torch::Tensor q, torch::Tensor qo_indptr, torch::Tensor k, torch::Tensor v,
     torch::Tensor kv_indptr, torch::Tensor custom_mask, torch::Tensor qk_indptr,
-    unsigned int pos_encoding_mode, bool allow_fp16_qk_reduction, float sm_scale, float rope_scale,
-    float rope_theta, bool return_lse) {
+    unsigned int pos_encoding_mode, bool allow_fp16_qk_reduction,
+    float sm_scale, float rope_scale, float rope_theta, bool return_lse) {
   CHECK_INPUT(q);
   CHECK_INPUT(qo_indptr);
   CHECK_INPUT(k);

@@ -386,8 +386,8 @@ std::vector<torch::Tensor> BatchPrefillWithRaggedKVCachePyTorchWrapper::ForwardC
   int64_t nnz_qo = q.size(0);
   int64_t num_qo_heads = q.size(1);
   int64_t head_dim = q.size(2);
-  CHECK_EQ(kv_indptr.size(0), batch_size + 1);
-  CHECK_EQ(qk_indptr.size(0), batch_size + 1);
+  CHECK_GE(kv_indptr.size(0), batch_size + 1);
+  CHECK_GE(qk_indptr.size(0), batch_size + 1);
   int64_t num_kv_heads = (kv_layout_ == QKVLayout::kNHD) ? k.size(1) : k.size(0);
   CHECK_EQ(k.size(0), v.size(0));
   CHECK_EQ(k.size(1), v.size(1));
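
The same relaxation applies to the prefill wrappers' `qo_indptr`, `paged_kv_indptr`, `kv_indptr`, and `qk_indptr` checks; the final two hunks only re-wrap the `ForwardCustomMask` parameter list and do not change behavior.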

python/tests/test_batch_decode_kernels.py

+4 −3

@@ -156,9 +156,10 @@ def test_cuda_graph_batch_decode_with_paged_kv_cache(
         (batch_size,), (kv_len - 1) % page_size + 1, dtype=torch.int32
     )

-    kv_indptr_device_buffer = torch.empty(batch_size + 1).int().to(0)
-    kv_indices_device_buffer = torch.empty(total_num_pages).int().to(0)
-    kv_last_page_device_buffer = torch.empty(batch_size).int().to(0)
+    # NOTE(Zihao): allocate more space than needed for testing
+    kv_indptr_device_buffer = torch.empty(batch_size + 11).int().to(0)
+    kv_indices_device_buffer = torch.empty(total_num_pages + 10).int().to(0)
+    kv_last_page_device_buffer = torch.empty(batch_size + 10).int().to(0)

     workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8).to(0)
     wrapper = flashinfer.CUDAGraphBatchDecodeWithPagedKVCacheWrapper(
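
Over-allocating the device buffers by 10 or 11 elements exercises the relaxed CHECK_GE checks: capture and replay must still succeed when the buffers are strictly larger than what a given batch needs, since the kernels only read the first `batch_size + 1` (or `batch_size`) entries.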
