add multi-item scoring #1015

Merged

41 commits merged on Apr 30, 2025

Commits
4bc79e6
add multi-item scoring
arde171 Apr 10, 2025
70bd358
fix precommit errors
arde171 Apr 11, 2025
1f79f0d
Merge pull request #1 from arde171/arde/mis
arde171 Apr 11, 2025
46fbae2
Merge branch 'flashinfer-ai:main' into main
arde171 Apr 11, 2025
3e8f974
fix clang
arde171 Apr 18, 2025
059fffc
fix unit test
arde171 Apr 19, 2025
3f76f38
update
arde171 Apr 20, 2025
6164dfe
additional params
arde171 Apr 25, 2025
8084962
fix
arde171 Apr 25, 2025
b15dafd
fixt
arde171 Apr 25, 2025
b3c5deb
fix
arde171 Apr 25, 2025
53de0e7
fix
arde171 Apr 25, 2025
a134677
fix
arde171 Apr 26, 2025
f0a4458
fix
arde171 Apr 26, 2025
6ea1d2a
fix
arde171 Apr 26, 2025
f0ee31d
fix
arde171 Apr 26, 2025
4ef78e5
fix
arde171 Apr 26, 2025
6c2f25e
fix
arde171 Apr 27, 2025
485e2bb
revert
arde171 Apr 27, 2025
68d87bd
fix
arde171 Apr 27, 2025
c959fcf
revert
arde171 Apr 27, 2025
32b8ea8
fix pybind
arde171 Apr 28, 2025
7e1d7a2
fix
arde171 Apr 28, 2025
c348fda
revert
arde171 Apr 28, 2025
33ff636
revert
arde171 Apr 28, 2025
5fbb4b2
incorporate review comments
arde171 Apr 28, 2025
0858050
Merge branch 'main' into main
arde171 Apr 28, 2025
bf2c2f3
typo
arde171 Apr 28, 2025
6225af8
fix
arde171 Apr 28, 2025
a1535a6
Merge remote-tracking branch 'origin/main' into arde171/main
yzh119 Apr 30, 2025
079c7fe
upd
yzh119 Apr 30, 2025
b917c64
else branch should be protected by constexpr
yzh119 Apr 30, 2025
e77eca5
upd
yzh119 Apr 30, 2025
24bfece
bugfix
yzh119 Apr 30, 2025
cef3255
fix conflicts with fp8 hopper
yzh119 Apr 30, 2025
4c19a0c
fix fp8
yzh119 Apr 30, 2025
b256c8f
bugfix
yzh119 Apr 30, 2025
f4629f4
bugfix
yzh119 Apr 30, 2025
06e11dd
fix decode
yzh119 Apr 30, 2025
e929fc4
remove lineinfo to fix binary size overflow
arde171 Apr 30, 2025
398b7a0
remove 3 from aot mask mode
yzh119 Apr 30, 2025
26 changes: 19 additions & 7 deletions aot_build_utils/generate_aot_default_additional_params_header.py
@@ -125,23 +125,35 @@ def get_aot_default_additional_params_header_str() -> str:

ret += generate_macro_entry(
"BATCH_PREFILL",
["maybe_custom_mask", "maybe_mask_indptr", "maybe_alibi_slopes"],
["uint8_t", "int32_t", "float"],
[
"maybe_custom_mask",
"maybe_mask_indptr",
"maybe_alibi_slopes",
"maybe_prefix_len_ptr",
"maybe_token_pos_in_items_ptr",
"maybe_max_item_len_ptr",
],
["uint8_t", "int32_t", "float", "uint32_t", "uint16_t", "uint16_t"],
[
"logits_soft_cap",
"sm_scale",
"rope_rcp_scale",
"rope_rcp_theta",
"token_pos_in_items_len",
],
["double", "double", "double", "double"],
["double", "double", "double", "double", "int64_t"],
)

ret += generate_macro_entry(
"BATCH_PREFILL_SM90",
[],
[],
["logits_soft_cap", "sm_scale"],
["double", "double"],
[
"maybe_prefix_len_ptr",
"maybe_token_pos_in_items_ptr",
"maybe_max_item_len_ptr",
],
["uint32_t", "uint16_t", "uint16_t"],
["logits_soft_cap", "sm_scale", "token_pos_in_items_len"],
["double", "double", "int64_t"],
is_sm90_template=True,
)

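Note: the three new tensor parameters and the token_pos_in_items_len scalar carry per-request metadata for multi-item scoring, where one sequence holds a shared prefix followed by several candidate items that should not attend to each other. The sketch below is illustrative only; the dtypes match the macro entry above (uint32 prefix lengths, uint16 per-token positions and per-request max item lengths), but the exact layout conventions are assumptions, not taken from this diff.

# Illustrative sketch (assumed layout), not code from this PR.
import numpy as np

# Toy batch: request 0 = 4-token shared prefix + items of length 2 and 3;
# request 1 = 5-token shared prefix + one item of length 2.
prefix_len = np.array([4, 5], dtype=np.uint32)        # maybe_prefix_len_ptr
item_lens = [[2, 3], [2]]

# Position of each post-prefix token inside its item, padded per request to a
# common row length (token_pos_in_items_len); 0 is used here as padding.
token_pos_in_items_len = 8
rows = []
for lens in item_lens:
    row = [p for n in lens for p in range(1, n + 1)]
    rows.append(row + [0] * (token_pos_in_items_len - len(row)))
token_pos_in_items = np.array(rows, dtype=np.uint16)  # maybe_token_pos_in_items_ptr

# Longest item per request.
max_item_len = np.array([max(lens) for lens in item_lens], dtype=np.uint16)  # maybe_max_item_len_ptr

print(prefix_len)
print(token_pos_in_items)
print(max_item_len)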
1 change: 1 addition & 0 deletions aot_build_utils/literal_map.py
@@ -18,6 +18,7 @@
0: "MaskMode::kNone",
1: "MaskMode::kCausal",
2: "MaskMode::kCustom",
3: "MaskMode::kMultiItemScoring",
}

pos_encoding_mode_literal = {
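Note: the new entry maps the integer mask mode 3 to the C++ enum literal emitted into generated sources. A minimal sketch of how such a map is typically consumed during code generation follows; the render helper is hypothetical and not part of this diff.

# Hypothetical illustration of turning an integer mask mode into a C++ literal.
mask_mode_literal = {
    0: "MaskMode::kNone",
    1: "MaskMode::kCausal",
    2: "MaskMode::kCustom",
    3: "MaskMode::kMultiItemScoring",  # added in this PR
}

def render_kernel_instantiation(mask_mode: int) -> str:
    return f"constexpr MaskMode MASK_MODE = {mask_mode_literal[mask_mode]};"

print(render_kernel_instantiation(3))
# constexpr MaskMode MASK_MODE = MaskMode::kMultiItemScoring;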
4 changes: 4 additions & 0 deletions csrc/batch_prefill_sm90.cu
@@ -141,6 +141,8 @@ void BatchPrefillWithRaggedKVCacheSM90Run(at::Tensor float_workspace_buffer,
GetPtrFromBaseOffset<IdType>(int_buffer_ptr, plan_info.head_indices_offset);
params.work_indptr =
GetPtrFromBaseOffset<IdType>(int_buffer_ptr, plan_info.work_indptr_offset);
params.batch_indices =
GetPtrFromBaseOffset<IdType>(int_buffer_ptr, plan_info.batch_indices_offset);

ADDITIONAL_PARAMS_SETTER

@@ -238,6 +240,8 @@ void BatchPrefillWithPagedKVCacheSM90Run(
GetPtrFromBaseOffset<IdType>(int_buffer_ptr, plan_info.head_indices_offset);
params.work_indptr =
GetPtrFromBaseOffset<IdType>(int_buffer_ptr, plan_info.work_indptr_offset);
params.batch_indices =
GetPtrFromBaseOffset<IdType>(int_buffer_ptr, plan_info.batch_indices_offset);
params.kv_indices = static_cast<IdType*>(paged_kv_indices.data_ptr());

ADDITIONAL_PARAMS_SETTER
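Note: params.batch_indices lets each unit of scheduled work recover the request it came from, which is what makes the per-request multi-item metadata (prefix length, token positions, max item length) addressable inside the kernel. Below is a simplified sketch of that mapping under an assumed scheduling model; the real planner in flashinfer is more involved.

# Illustrative only (assumed scheduling model): each request may be split into
# several work tiles, and batch_indices records the owning request per tile.
def make_batch_indices(tiles_per_request):
    batch_indices = []
    for req_id, num_tiles in enumerate(tiles_per_request):
        batch_indices.extend([req_id] * num_tiles)
    return batch_indices

print(make_batch_indices([2, 3, 1]))  # [0, 0, 1, 1, 1, 2]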
2 changes: 2 additions & 0 deletions csrc/batch_prefill_sm90_customize_config.jinja
@@ -43,6 +43,7 @@ struct RaggedParams {
IdType* kv_lens;
IdType* head_indices;
IdType* work_indptr;
IdType* batch_indices;
Collaborator:
Thanks for doing this, yes we have to add it.


struct AdditionalParams {
{{ additional_params_decl }}
@@ -88,6 +89,7 @@ struct PagedParams {
IdType* kv_lens;
IdType* head_indices;
IdType* work_indptr;
IdType* batch_indices;

struct AdditionalParams {
{{ additional_params_decl }}
4 changes: 4 additions & 0 deletions flashinfer/decode.py
@@ -1170,13 +1170,17 @@ def run(
None, # packed_custom_mask
None, # mask_indptr_buf
_get_cache_alibi_slopes_buf(q.shape[1], q.device),
None, # maybe_prefix_len_ptr
None, # maybe_token_pos_in_items_ptr
None, # maybe_max_item_len_ptr
logits_soft_cap,
sm_scale,
None, # scale_q, not supported yet
None, # scale_k
None, # scale_v
rope_scale,
rope_theta,
0, # token_pos_in_items_len
]

self._cached_module.paged_run(*run_args)
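Note: decode never uses multi-item scoring, so its run-argument list only grows by disabled placeholders: None for the three metadata pointers and 0 for token_pos_in_items_len. The sketch below contrasts those defaults with hypothetical enabled values a batch-prefill caller might supply; the enabled names and values are assumptions for illustration.

# Disabled defaults, mirroring the decode path in the diff above.
disabled_multi_item_args = dict(
    maybe_prefix_len_ptr=None,
    maybe_token_pos_in_items_ptr=None,
    maybe_max_item_len_ptr=None,
    token_pos_in_items_len=0,
)

# Hypothetical enabled counterpart for a prefill call (tensors as in the
# earlier metadata sketch; not an actual API shown in this diff):
# enabled_multi_item_args = dict(
#     maybe_prefix_len_ptr=prefix_len,                  # uint32, one entry per request
#     maybe_token_pos_in_items_ptr=token_pos_in_items,  # uint16, padded rows
#     maybe_max_item_len_ptr=max_item_len,              # uint16, one entry per request
#     token_pos_in_items_len=8,                         # padded row length
# )
print(disabled_multi_item_args)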
39 changes: 27 additions & 12 deletions flashinfer/jit/attention/pytorch.py
@@ -645,8 +645,8 @@ def gen_customize_pod_module(

source_paths = []

for mask_mode_p in [0, 1, 2]:
for mask_mode_d in [0, 1, 2]:
for mask_mode_p in [0, 1, 2, 3]:
for mask_mode_d in [0, 1, 2, 3]:
kwargs["mask_mode_p"] = mask_mode_literal[mask_mode_p]
kwargs["mask_mode_d"] = mask_mode_literal[mask_mode_d]

@@ -759,27 +759,42 @@ def gen_batch_prefill_module(
"maybe_custom_mask",
"maybe_mask_indptr",
"maybe_alibi_slopes",
"maybe_prefix_len_ptr",
"maybe_token_pos_in_items_ptr",
"maybe_max_item_len_ptr",
]
additional_tensor_dtypes = [
"uint8_t",
"int32_t",
"float",
"uint32_t",
"uint16_t",
"uint16_t",
] # NOTE(Zihao): int32_t should follow dtype_idx
additional_scalar_names = [
"logits_soft_cap",
"sm_scale",
"rope_rcp_scale",
"rope_rcp_theta",
"token_pos_in_items_len",
]
additional_scalar_dtypes = ["double", "double", "double", "double"]
additional_scalar_dtypes = ["double", "double", "double", "double", "int64_t"]
variant_name = f"DefaultAttention<use_custom_mask, {str(use_sliding_window).lower()}, {str(use_logits_soft_cap).lower()}, {str(pos_encoding_mode == 2).lower()}>"
variant_decl = f"#include<flashinfer/attention/variants.cuh>"
variant_decl = "#include<flashinfer/attention/variants.cuh>"
else:
if not fp8_enabled:
additional_tensor_names = []
additional_tensor_dtypes = []
additional_scalar_names = ["logits_soft_cap", "sm_scale"]
additional_scalar_dtypes = ["double", "double"]
additional_tensor_names = [
"maybe_prefix_len_ptr",
"maybe_token_pos_in_items_ptr",
"maybe_max_item_len_ptr",
]
additional_tensor_dtypes = ["uint32_t", "uint16_t", "uint16_t"]
additional_scalar_names = [
"logits_soft_cap",
"sm_scale",
"token_pos_in_items_len",
]
additional_scalar_dtypes = ["double", "double", "int64_t"]
variant_name = f"DefaultAttention<{str(use_logits_soft_cap).lower()}>"
variant_decl = f"#include<flashinfer/attention/hopper/variants.cuh>"
else:
@@ -961,7 +976,7 @@ def gen_customize_single_prefill_module(
os.makedirs(gen_directory, exist_ok=True)

source_paths = []
for mask_mode in [0, 1, 2]:
for mask_mode in [0, 1, 2, 3]:
filename = f"single_prefill_kernel_mask_{mask_mode}.cu"
dest_path = gen_directory / filename
source_paths.append(dest_path)
@@ -1025,7 +1040,7 @@ def gen_customize_single_prefill_module(
os.makedirs(gen_directory, exist_ok=True)

source_paths = []
for mask_mode in [0, 1, 2]:
for mask_mode in [0, 1, 2, 3]:
filename = f"single_prefill_sm90_kernel_mask_{mask_mode}.cu"
dest_path = gen_directory / filename
source_paths.append(dest_path)
@@ -1209,7 +1224,7 @@ def gen_customize_batch_prefill_module(
os.makedirs(gen_directory, exist_ok=True)

source_paths = []
for mask_mode in [0, 1, 2]:
for mask_mode in [0, 1, 2, 3]:
dest_path = (
gen_directory / f"batch_prefill_paged_kernel_mask_{mask_mode}.cu"
)
@@ -1286,7 +1301,7 @@ def gen_customize_batch_prefill_module(
generated_inc_str = config_templ.render(**kwargs)

source_paths = []
for mask_mode in [0, 1, 2]:
for mask_mode in [0, 1, 2, 3]:
filename = f"batch_prefill_paged_sm90_kernel_mask_{mask_mode}.cu"
dest_path = gen_directory / filename
source_paths.append(dest_path)
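Note: every kernel-generation loop now includes mask mode 3, so a kernel variant is emitted for multi-item scoring alongside the existing none/causal/custom modes. Below is a hypothetical runtime dispatch that this enables; the helper is an illustration, not code from this PR.

# Hypothetical illustration: pick the integer mask mode the generated kernels
# are keyed on. Mode 3 is only selectable because kernels are now generated
# for it.
def choose_mask_mode(causal, custom_mask, prefix_len_ptr):
    if prefix_len_ptr is not None:
        return 3  # MaskMode::kMultiItemScoring
    if custom_mask is not None:
        return 2  # MaskMode::kCustom
    return 1 if causal else 0  # kCausal / kNone

assert choose_mask_mode(True, None, object()) == 3
assert choose_mask_mode(True, None, None) == 1
assert choose_mask_mode(False, None, None) == 0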
1 change: 1 addition & 0 deletions flashinfer/jit/utils.py
@@ -98,4 +98,5 @@ def wrapper(func, args):
0: "MaskMode::kNone",
1: "MaskMode::kCausal",
2: "MaskMode::kCustom",
3: "MaskMode::kMultiItemScoring",
}