
Commit 3b50dd5

feat: add use_tensor_cores option to decode kernels to accelerate GQA (#317)
The tensor-core-accelerated GQA described in our [blog post](https://flashinfer.ai/2024/02/02/introduce-flashinfer.html) was not enabled by default (users had to call the prefill kernels/wrappers for decode to get that acceleration). This PR adds a `use_tensor_cores` option to the decode operators/wrappers, so users can choose whether to use tensor cores depending on their use case. Note that our prefill kernels are compiled for all possible group sizes (#301), but the decode kernels are not, so users who need a general group size are encouraged to set `use_tensor_cores=True`.
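For illustration, a minimal sketch of how the new option might be used from the Python decode wrapper; only the `use_tensor_cores` flag comes from this PR, while the workspace size and other arguments below are assumptions for the example:

import torch
import flashinfer

# Allocate the auxiliary workspace buffer (128MB matches the size recommended elsewhere in this PR).
workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")

# Opt in to the tensor-core (prefill-based) decode path for GQA workloads.
decode_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
    workspace_buffer, "NHD", use_tensor_cores=True
)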
1 parent 2ef20c1 commit 3b50dd5

6 files changed (+506, −137 lines)


include/flashinfer/utils.cuh (−15 lines)

@@ -91,24 +91,9 @@
   if (group_size == 1) {                  \
     constexpr size_t GROUP_SIZE = 1;      \
     __VA_ARGS__                           \
-  } else if (group_size == 2) {           \
-    constexpr size_t GROUP_SIZE = 2;      \
-    __VA_ARGS__                           \
-  } else if (group_size == 3) {           \
-    constexpr size_t GROUP_SIZE = 3;      \
-    __VA_ARGS__                           \
   } else if (group_size == 4) {           \
     constexpr size_t GROUP_SIZE = 4;      \
     __VA_ARGS__                           \
-  } else if (group_size == 5) {           \
-    constexpr size_t GROUP_SIZE = 5;      \
-    __VA_ARGS__                           \
-  } else if (group_size == 6) {           \
-    constexpr size_t GROUP_SIZE = 6;      \
-    __VA_ARGS__                           \
-  } else if (group_size == 7) {           \
-    constexpr size_t GROUP_SIZE = 7;      \
-    __VA_ARGS__                           \
   } else if (group_size == 8) {           \
     constexpr size_t GROUP_SIZE = 8;      \
     __VA_ARGS__                           \
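After this change the decode dispatch macro is only specialized for group sizes 1, 4, and 8, while the prefill path still covers all group sizes (#301). A purely illustrative sketch of how a caller could decide when to opt into tensor cores; the head counts and the heuristic itself are assumptions, not library behavior:

# Illustrative heuristic only: enable tensor cores when the GQA group size is not
# one of the decode-specialized values {1, 4, 8}.
num_qo_heads, num_kv_heads = 28, 4           # hypothetical model config
group_size = num_qo_heads // num_kv_heads    # 7: not compiled into the decode kernels
use_tensor_cores = group_size not in (1, 4, 8)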

python/csrc/single_prefill.cu (+4, −8 lines)

@@ -38,16 +38,14 @@ std::vector<torch::Tensor> single_prefill_with_kv_cache(
   unsigned int head_dim = q.size(2);
   unsigned int kv_len, qo_len, num_kv_heads, num_qo_heads;
   QKVLayout kv_layout = static_cast<QKVLayout>(layout);
+  qo_len = q.size(0);
+  num_qo_heads = q.size(1);
   if (kv_layout == QKVLayout::kNHD) {
     kv_len = k.size(0);
-    qo_len = q.size(0);
     num_kv_heads = k.size(1);
-    num_qo_heads = q.size(1);
   } else {
     kv_len = k.size(1);
-    qo_len = q.size(1);
     num_kv_heads = k.size(0);
-    num_qo_heads = q.size(0);
   }
   CHECK_GQA_HEAD_DIVISIBLE(num_qo_heads, num_kv_heads);
   cudaStream_t torch_current_stream = c10::cuda::getCurrentCUDAStream();

@@ -122,16 +120,14 @@ std::vector<torch::Tensor> single_prefill_with_kv_cache_custom_mask(
   unsigned int head_dim = q.size(2);
   unsigned int kv_len, qo_len, num_kv_heads, num_qo_heads;
   QKVLayout kv_layout = static_cast<QKVLayout>(layout);
+  qo_len = q.size(0);
+  num_qo_heads = q.size(1);
   if (kv_layout == QKVLayout::kNHD) {
     kv_len = k.size(0);
-    qo_len = q.size(0);
     num_kv_heads = k.size(1);
-    num_qo_heads = q.size(1);
   } else {
     kv_len = k.size(1);
-    qo_len = q.size(1);
     num_kv_heads = k.size(0);
-    num_qo_heads = q.size(0);
   }
   CHECK_GQA_HEAD_DIVISIBLE(num_qo_heads, num_kv_heads);
   cudaStream_t torch_current_stream = c10::cuda::getCurrentCUDAStream();
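The refactor above reads qo_len and num_qo_heads from q before the layout branch, so only k's shape interpretation depends on kv_layout. A small Python sketch of the equivalent shape bookkeeping, with made-up tensor shapes (the variable names mirror the C++ code but are otherwise illustrative):

import torch

kv_layout = "NHD"
q = torch.randn(7, 32, 128)     # always read as (qo_len, num_qo_heads, head_dim)
k = torch.randn(100, 4, 128)    # NHD: (kv_len, num_kv_heads, head_dim)

qo_len, num_qo_heads = q.shape[0], q.shape[1]
if kv_layout == "NHD":
    kv_len, num_kv_heads = k.shape[0], k.shape[1]
else:                           # "HND": (num_kv_heads, kv_len, head_dim)
    kv_len, num_kv_heads = k.shape[1], k.shape[0]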

python/flashinfer/cascade.py (+5, −5 lines)

@@ -307,8 +307,8 @@ class BatchDecodeWithSharedPrefixPagedKVCacheWrapper:
     >>> head_dim = 128
     >>> max_num_pages = 128
     >>> page_size = 16
-    >>> # allocate 16MB workspace buffer
-    >>> workspace_buffer = torch.empty(16 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
+    >>> # allocate 128MB workspace buffer
+    >>> workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
     >>> wrapper = flashinfer.BatchDecodeWithSharedPrefixPagedKVCacheWrapper(
     ...     workspace_buffer, "NHD"
     ... )

@@ -540,8 +540,8 @@ class BatchPrefillWithSharedPrefixPagedKVCacheWrapper:
     >>> head_dim = 128
     >>> max_num_pages = 128
     >>> page_size = 16
-    >>> # allocate 16MB workspace buffer
-    >>> workspace_buffer = torch.empty(16 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
+    >>> # allocate 128MB workspace buffer
+    >>> workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
     >>> prefill_wrapper = flashinfer.BatchPrefillWithSharedPrefixPagedKVCacheWrapper(
     ...     workspace_buffer, "NHD"
     ... )

@@ -617,7 +617,7 @@ def __init__(self, workspace_buffer: torch.Tensor, kv_layout: str = "NHD"):
         ----------
         workspace_buffer : torch.Tensor
             The user reserved workspace buffer used to store auxiliary data structures,
-            recommended size is 16MB, the device of the workspace buffer should be the
+            recommended size is 128MB, the device of the workspace buffer should be the
             same as the device of the input tensors.
         kv_layout : str
             The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
