
Commit 88e3dee

perf: MLA decode kernel implemented by CuTe targeted to SM80 (#844)
Hi @yzh119, this is a follow-up to #766. An interesting idea came to mind today and I couldn't resist changing a few lines to verify it: we can use an asymmetric warp configuration to work around the register-file size limit. The fix is simply to use 8 warps for the output MMA stage and keep the other stages unchanged, because the limit applies to the register count per CUDA block rather than to the whole SM, and the 64K 32-bit registers per SM are enough to hold the f32 output of 64 heads. We now have 4 warps for the attention MMA stage, 2 warps for the softmax stage, 8 warps for the output MMA stage, and 4 warps for the data-load stage. The updated diagram:

![image](https://github.com/user-attachments/assets/2af8c5d9-d5a5-47e6-bd63-7e6b4305a529)

After the change the output MMA stage does more work, so the benchmark drops a little as expected, but it still looks good:

![image](https://github.com/user-attachments/assets/470ec576-ba91-4e71-9604-fcd6f0a9d691)

According to #814, this CuTe implementation performs slightly better than the current FA2 implementation:

![image](https://github.com/user-attachments/assets/9f61e2ff-4bb6-4581-a199-bb6176173192)

So I think the CuTe implementation still has value. Given its interesting scheduling design and better performance, we can treat it as an ad hoc implementation for the decode-only / 128 q-heads / SM80 case, and the JIT logic can accommodate this kernel.
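A rough back-of-the-envelope check of the register argument above, as a minimal sketch. The numbers rest on my own assumptions, not on the kernel source: head_dim_ckv = 512, a 64-head qo tile per CUDA block, and one f32 register per output element.

```python
# Back-of-the-envelope check of the register budget described above.
# Assumptions (mine, not taken from the kernel): head_dim_ckv = 512,
# a 64-head qo tile per CUDA block, one f32 register per output element.
HEAD_DIM_CKV = 512
QO_TILE_HEADS = 64
acc_regs = QO_TILE_HEADS * HEAD_DIM_CKV      # 32768 f32 accumulators

REGFILE_PER_SM = 64 * 1024                   # 64K 32-bit registers per SM on SM80
assert acc_regs <= REGFILE_PER_SM            # the whole output tile fits in one SM's register file

# Per-thread pressure of the output MMA stage:
print(acc_regs // (8 * 32))                  # 8 warps -> 128 regs/thread, under the 255-per-thread cap
print(acc_regs // (4 * 32))                  # 4 warps -> 256 regs/thread, already over the cap
```

Under these assumptions, going from 4 to 8 warps on the output MMA stage is exactly what brings the per-thread accumulator count back under the hardware limit.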
1 parent b19ad91 commit 88e3dee

File tree

7 files changed: +790 −14 lines


csrc/batch_decode_mla_config.jinja

+2
@@ -14,6 +14,8 @@ constexpr bool USE_LOGITS_SOFT_CAP = {{ use_logits_soft_cap }};
 constexpr int HEAD_DIM_CKV = {{ head_dim_ckv }};
 constexpr int HEAD_DIM_KPE = {{ head_dim_kpe }};
 
+constexpr int QO_TILE_LEN = {{ qo_tile_len }};
+
 using Params = BatchDecodeParamsMLA<DTypeQ, DTypeKV, DTypeO, IdType>;
 using AttentionVariant =
     DefaultAttention</*use_custom_mask=*/false, USE_SLIDING_WINDOW, USE_LOGITS_SOFT_CAP, /*use_alibi*/false>;
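For illustration, a minimal sketch of how a config like this is rendered into compile-time constants. The template string below is a trimmed stand-in, not the real batch_decode_mla_config.jinja, and the values (512 / 64 / 64) mirror the SM80 defaults chosen by the JIT logic further down.

```python
import jinja2

# Trimmed stand-in for batch_decode_mla_config.jinja (illustrative only).
config_templ = jinja2.Template(
    "constexpr int HEAD_DIM_CKV = {{ head_dim_ckv }};\n"
    "constexpr int HEAD_DIM_KPE = {{ head_dim_kpe }};\n"
    "constexpr int QO_TILE_LEN = {{ qo_tile_len }};\n"
)

# SM80 path: 64-head qo tile; SM90 would render qo_tile_len=128 instead.
print(config_templ.render(head_dim_ckv=512, head_dim_kpe=64, qo_tile_len=64))
```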

csrc/batch_decode_mla_cute_sm80.cu

+107
@@ -0,0 +1,107 @@ (new file)

#include <optional>

#include "pytorch_extension_utils.h"

#include "mla_config.inc"

#include <flashinfer/attention/decode_mla_cute_sm80.cuh>
#include <flashinfer/attention/scheduler.cuh>

using namespace flashinfer;

std::vector<int64_t> BatchDecodeWithPagedKVCachePlanMLA(
    at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer,
    at::Tensor page_locked_int_workspace_buffer, at::Tensor indptr, unsigned int batch_size,
    unsigned int num_qo_heads, unsigned int page_size, bool enable_cuda_graph,
    int64_t cuda_stream) {
  size_t float_workspace_size_in_bytes =
      float_workspace_buffer.size(0) * float_workspace_buffer.element_size();
  size_t int_workspace_size_in_bytes =
      int_workspace_buffer.size(0) * int_workspace_buffer.element_size();

  DecodePlanInfo plan_info;
  cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);

  auto work_estimation_func =
      BatchDecodeWithPagedKVCacheWorkEstimationDispatchedMlaCuteSM80<HEAD_DIM_CKV, HEAD_DIM_KPE, QO_TILE_LEN,
                                                                     AttentionVariant, Params>;
  cudaError_t status =
      DecodePlan<HEAD_DIM_CKV, flashinfer::PosEncodingMode::kNone, AttentionVariant, Params>(
          static_cast<void*>(float_workspace_buffer.data_ptr()), float_workspace_size_in_bytes,
          static_cast<void*>(int_workspace_buffer.data_ptr()),
          static_cast<void*>(page_locked_int_workspace_buffer.data_ptr()),
          int_workspace_size_in_bytes, plan_info, static_cast<IdType*>(indptr.data_ptr()),
          batch_size, num_qo_heads, page_size, enable_cuda_graph, /*stream=*/stream,
          work_estimation_func);

  TORCH_CHECK(status == cudaSuccess, "BatchDecodeWithPagedKVCachePlanMLA failed with error ",
              cudaGetErrorString(status));

  return plan_info.ToVector();
}

void BatchDecodeWithPagedKVCacheRunMLA(
    at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer,
    std::vector<int64_t> plan_info_vec, at::Tensor q_nope, at::Tensor q_pe,
    at::Tensor paged_ckv_cache, at::Tensor paged_kpe_cache, at::Tensor paged_kv_indptr,
    at::Tensor paged_kv_indices, at::Tensor paged_kv_last_page_len, at::Tensor o, float sm_scale,
    int window_left, float logits_soft_cap, float rope_scale, float rope_theta,
    std::optional<at::Tensor> maybe_lse, int64_t cuda_stream) {
  DecodePlanInfo plan_info;
  plan_info.FromVector(plan_info_vec);

  auto device = q_nope.device();
  int64_t batch_size = q_nope.size(0);
  int64_t num_qo_heads = q_nope.size(1);
  int64_t page_size = paged_ckv_cache.size(1);

  if (maybe_lse) {
    const auto& lse = *maybe_lse;
    TORCH_CHECK(lse.size(0) == batch_size, lse.size(0), q_nope.size(0));
    TORCH_CHECK(lse.size(1) == num_qo_heads, lse.size(1), q_nope.size(1));
  }

  TORCH_CHECK(logits_soft_cap >= 0.f, "logits_soft_cap must be non-negative");

  void* float_buffer = static_cast<void*>(float_workspace_buffer.data_ptr());
  void* int_buffer = static_cast<void*>(int_workspace_buffer.data_ptr());

  paged_kv_mla_t<DTypeKV, IdType> paged_kv(
      page_size, HEAD_DIM_CKV, HEAD_DIM_KPE, batch_size,
      static_cast<DTypeKV*>(paged_ckv_cache.data_ptr()), paged_ckv_cache.strides().data(),
      static_cast<DTypeKV*>(paged_kpe_cache.data_ptr()), paged_kpe_cache.strides().data(),
      static_cast<IdType*>(paged_kv_indices.data_ptr()),
      static_cast<IdType*>(paged_kv_indptr.data_ptr()),
      static_cast<IdType*>(paged_kv_last_page_len.data_ptr()));
  Params params(static_cast<DTypeQ*>(q_nope.data_ptr()), static_cast<DTypeQ*>(q_pe.data_ptr()),
                /*q_offset=*/nullptr, paged_kv, static_cast<DTypeO*>(o.data_ptr()),
                /*lse=*/(maybe_lse ? static_cast<float*>(maybe_lse->data_ptr()) : nullptr),
                num_qo_heads, window_left, logits_soft_cap, sm_scale, rope_scale, rope_theta);

  DTypeO* tmp_v = nullptr;
  float* tmp_s = nullptr;
  params.request_indices =
      GetPtrFromBaseOffset<IdType>(int_buffer, plan_info.request_indices_offset);
  params.kv_tile_indices =
      GetPtrFromBaseOffset<IdType>(int_buffer, plan_info.kv_tile_indices_offset);
  params.o_indptr = GetPtrFromBaseOffset<IdType>(int_buffer, plan_info.o_indptr_offset);
  params.kv_chunk_size_ptr =
      GetPtrFromBaseOffset<IdType>(int_buffer, plan_info.kv_chunk_size_ptr_offset);
  if (plan_info.split_kv) {
    tmp_v = GetPtrFromBaseOffset<DTypeO>(float_buffer, plan_info.v_offset);
    tmp_s = GetPtrFromBaseOffset<float>(float_buffer, plan_info.s_offset);
    if (plan_info.enable_cuda_graph) {
      params.block_valid_mask =
          GetPtrFromBaseOffset<bool>(int_buffer, plan_info.block_valid_mask_offset);
    }
  }
  params.padded_batch_size = plan_info.padded_batch_size;

  cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
  cudaError_t status =
      BatchDecodeWithPagedKVCacheDispatchedMlaCuteSM80<HEAD_DIM_CKV, HEAD_DIM_KPE, QO_TILE_LEN,
                                                       Params>(params, tmp_v, tmp_s, /*stream=*/stream);
  TORCH_CHECK(status == cudaSuccess, "BatchDecodeWithPagedKVCache failed with error ",
              cudaGetErrorString(status));
}
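As context for the paged_kv_indptr / paged_kv_indices / paged_kv_last_page_len arguments consumed by the run function above, a small self-contained sketch of the paged-KV bookkeeping for a toy batch. The CSR-style layout follows flashinfer's paged KV cache convention; the concrete page ids and lengths are made up.

```python
import torch

# Toy batch: request 0 holds 5 kv tokens, request 1 holds 3, with page_size = 4.
page_size = 4
kv_lens = [5, 3]
pages_per_req = [(n + page_size - 1) // page_size for n in kv_lens]   # [2, 1]

# indptr partitions the flat page-id list per request (CSR style).
paged_kv_indptr = torch.tensor([0, 2, 3], dtype=torch.int32)
# Page ids in the global page pool assigned to each request (arbitrary here).
paged_kv_indices = torch.tensor([7, 2, 5], dtype=torch.int32)
# Valid token count in each request's last page, in [1, page_size].
paged_kv_last_page_len = torch.tensor([1, 3], dtype=torch.int32)

for i, n in enumerate(kv_lens):
    pages = paged_kv_indices[paged_kv_indptr[i] : paged_kv_indptr[i + 1]]
    assert (len(pages) - 1) * page_size + paged_kv_last_page_len[i] == n
```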

flashinfer/decode.py

+13-1
@@ -1252,6 +1252,7 @@ def __init__(
         self,
         float_workspace_buffer: torch.Tensor,
         use_cuda_graph: bool = False,
+        use_tensor_cores: bool = False,
         paged_kv_indptr_buffer: Optional[torch.Tensor] = None,
         paged_kv_indices_buffer: Optional[torch.Tensor] = None,
         paged_kv_last_page_len_buffer: Optional[torch.Tensor] = None,

@@ -1269,7 +1270,11 @@ def __init__(
             Whether to enable CUDAGraph for batch decode attention, if enabled, the
             auxiliary data structures will be stored as the provided buffers. The ``batch_size``
             cannot change during the lifecycle of this wrapper when CUDAGraph is enabled.
-
+
+        use_tensor_cores : bool
+            Whether to use tensor cores for the computation. Will be faster for large group
+            size in grouped query attention. Defaults to ``False``.
+
         paged_kv_indptr_buffer : Optional[torch.Tensor]
             The user reserved buffer on GPU to store the indptr of the paged kv cache, the size
             of the buffer should be ``[batch_size + 1]``.

@@ -1319,6 +1324,7 @@ def __init__(
         else:
             self._fixed_batch_size = 0

+        self._use_tensor_cores = use_tensor_cores
         self._paged_kv_indptr_buf = paged_kv_indptr_buffer
         self._paged_kv_indices_buf = paged_kv_indices_buffer
         self._paged_kv_last_page_len_buf = paged_kv_last_page_len_buffer

@@ -1328,6 +1334,10 @@
     def is_cuda_graph_enabled(self) -> bool:
         return self._use_cuda_graph

+    @property
+    def use_tensor_cores(self) -> bool:
+        return self._use_tensor_cores
+
     def reset_workspace_buffer(
         self, float_workspace_buffer: torch.Tensor, int_workspace_buffer: torch.Tensor
     ) -> None:

@@ -1445,8 +1455,10 @@ def plan(
             q_data_type,
             indptr.dtype,
             head_dim_compressed_kv,
+            num_qo_heads,
             window_left != -1,  # use_sliding_window
             logits_soft_cap > 0,  # use_logits_soft_cap
+            self._use_tensor_cores,
         )
         with self.device as device:
             self._plan_info = self._cached_module.plan(
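A hedged usage sketch for the new use_tensor_cores flag, assuming the MLA decode wrapper in flashinfer/decode.py is BatchDecodeMlaWithPagedKVCacheWrapper. The plan()/run() argument lists are not part of this diff, so they are only indicated in comments and should be checked against the wrapper's docstrings.

```python
import torch
import flashinfer

workspace = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda")

# Opt in to the tensor-core path; the JIT layer still falls back to the
# CUDA-core MLA kernel when the dtype / head-count / arch conditions are not met.
wrapper = flashinfer.BatchDecodeMlaWithPagedKVCacheWrapper(
    workspace, use_tensor_cores=True
)
assert wrapper.use_tensor_cores

# Typical flow (arguments elided; exact signatures are assumptions here):
#   wrapper.plan(kv_indptr, kv_indices, kv_last_page_len, num_qo_heads=128,
#                head_dim_compressed_kv=512, page_size=..., sm_scale=..., ...)
#   out = wrapper.run(q_nope, q_pe, paged_ckv_cache, paged_kpe_cache)
```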

flashinfer/jit/attention.py

+43-12
@@ -22,7 +22,7 @@
 import jinja2
 import torch

-from .core import load_cuda_ops, sm90a_nvcc_flags
+from .core import logger, load_cuda_ops, sm90a_nvcc_flags
 from .env import FLASHINFER_CSRC_DIR, FLASHINFER_GEN_SRC_DIR
 from .utils import (
     dtype_map,

@@ -216,20 +216,20 @@ def get_batch_decode_mla_uri(
     dtype_kv: torch.dtype,
     dtype_o: torch.dtype,
     dtype_idx: torch.dtype,
-    head_dim_qk: int,
-    head_dim_vo: int,
+    head_dim_ckv: int,
     use_sliding_window: bool,
     use_logits_soft_cap: bool,
+    arc: str,
 ) -> str:
     return (
         f"batch_decode_mla_with_kv_cache_dtype_q_{filename_safe_dtype_map[dtype_q]}_"
         f"dtype_kv_{filename_safe_dtype_map[dtype_kv]}_"
         f"dtype_o_{filename_safe_dtype_map[dtype_o]}_"
         f"dtype_idx_{filename_safe_dtype_map[dtype_idx]}_"
-        f"head_dim_qk_{head_dim_qk}_"
-        f"head_dim_vo_{head_dim_vo}_"
+        f"head_dim_ckv{head_dim_ckv}_"
         f"use_swa_{use_sliding_window}_"
-        f"use_logits_cap_{use_logits_soft_cap}"
+        f"use_logits_cap_{use_logits_soft_cap}_"
+        f"arc_{arc}"
     )

@@ -239,18 +239,39 @@ def gen_batch_decode_mla_module(
     dtype_o: torch.dtype,
     dtype_idx: torch.dtype,
     head_dim: int,
+    num_qo_heads: int,
     use_sliding_window: bool,
     use_logits_soft_cap: bool,
+    use_tensor_cores: bool,
 ):
+    cuda_arch_major = torch.cuda.get_device_properties(0).major
+
+    if cuda_arch_major >= 9:  # smem size of SM90 can accommodate all 128 qo-heads data
+        qo_tile_len = 128
+    else:
+        qo_tile_len = 64
+
+    if (
+        use_tensor_cores and
+        cuda_arch_major >= 8 and num_qo_heads % qo_tile_len == 0 and
+        dtype_q == torch.float16 and dtype_kv == torch.float16 and
+        dtype_o == torch.float16
+    ):
+        logger.info(f"Use tensor-core SM80 version of MLA decode kernel.")
+        arc = "sm80"
+    else:
+        logger.info(f"Fall back to cuda-core version of MLA decode kernel.")
+        arc = "cuda_core"
+
     uri = get_batch_decode_mla_uri(
         dtype_q,
         dtype_kv,
         dtype_o,
         dtype_idx,
         head_dim,
-        head_dim,
         use_sliding_window,
         use_logits_soft_cap,
+        arc,
     )
     gen_directory = FLASHINFER_GEN_SRC_DIR / uri
     os.makedirs(gen_directory, exist_ok=True)

@@ -267,17 +288,27 @@
             dtype_idx=dtype_map[dtype_idx],
             head_dim_ckv=head_dim,
             head_dim_kpe=head_dim // 8,
+            qo_tile_len=qo_tile_len,
             use_sliding_window=str(use_sliding_window).lower(),
             use_logits_soft_cap=str(use_logits_soft_cap).lower(),
         ),
     )
+
+    filenames = []
+    if arc == "sm80":
+        filenames = [
+            "batch_decode_mla_cute_sm80.cu",
+            "batch_decode_mla_pybind.cu",
+        ]
+    else:
+        filenames = [
+            "batch_decode_mla_plan.cu",
+            "batch_decode_mla_run.cu",
+            "batch_decode_mla_pybind.cu",
+        ]

     source_paths = []
-    for filename in [
-        "batch_decode_mla_plan.cu",
-        "batch_decode_mla_run.cu",
-        "batch_decode_mla_pybind.cu",
-    ]:
+    for filename in filenames:
         src_path = FLASHINFER_CSRC_DIR / filename
         dest_path = gen_directory / filename
         source_paths.append(dest_path)
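To make the new dispatch easy to scan, here is a standalone paraphrase of the arc / qo_tile_len selection above as a pure function. It mirrors the condition in gen_batch_decode_mla_module but is illustrative only; the function name is not part of the library.

```python
import torch

def mla_decode_backend(num_qo_heads: int,
                       dtype_q: torch.dtype,
                       dtype_kv: torch.dtype,
                       dtype_o: torch.dtype,
                       use_tensor_cores: bool,
                       cuda_arch_major: int) -> str:
    """Illustrative paraphrase of the arc/qo_tile_len selection in the diff above."""
    # SM90 shared memory can hold all 128 qo-heads; older parts tile by 64.
    qo_tile_len = 128 if cuda_arch_major >= 9 else 64
    if (use_tensor_cores
            and cuda_arch_major >= 8
            and num_qo_heads % qo_tile_len == 0
            and dtype_q == dtype_kv == dtype_o == torch.float16):
        return "sm80"       # CuTe tensor-core kernel (batch_decode_mla_cute_sm80.cu)
    return "cuda_core"      # existing plan/run CUDA-core kernels

# DeepSeek-style MLA decode with 128 fp16 q-heads on SM80 hits the CuTe path:
assert mla_decode_backend(128, torch.float16, torch.float16, torch.float16, True, 8) == "sm80"
# With 64 heads on SM90, 64 % 128 != 0, so it falls back to the CUDA-core kernels:
assert mla_decode_backend(64, torch.float16, torch.float16, torch.float16, True, 9) == "cuda_core"
```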
