
Commit 26ebac7

[NVIDIA] Add Cutlass MLA backend (#1031)
This PR adds a `cutlass` backend to the flashinfer `BatchMLAPagedAttentionWrapper`. cc @yzh119 @kushanam
1 parent cb5462d commit 26ebac7
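A minimal usage sketch of the new backend (illustrative, not part of this commit): the wrapper is constructed with `backend="cutlass"`, the cutlass branch of `run()` does not consult any `plan()` state, and `kv_len`/`page_table` are passed directly to `run()`. The shapes follow the checks added in `flashinfer/mla.py` (128 heads, 512 latent + 64 rope head dims, fp16/bf16 inputs, int32 `kv_len`/`page_table`); the workspace size and the concrete batch/page sizes are placeholders, and an SM100 (`sm_100a`) GPU with CUDA 12.9+ is assumed.

    import torch
    from flashinfer.mla import BatchMLAPagedAttentionWrapper

    # Placeholder sizes; 128 heads and 512 + 64 head dims are required by the
    # shape checks this PR adds. pages_per_seq must be a multiple of 128 / page_size.
    batch_size, num_heads, page_size, pages_per_seq = 4, 128, 64, 2
    num_pages = batch_size * pages_per_seq

    workspace = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda")
    mla = BatchMLAPagedAttentionWrapper(workspace, backend="cutlass")

    q_nope = torch.randn(batch_size, num_heads, 512, dtype=torch.half, device="cuda")
    q_pe = torch.randn(batch_size, num_heads, 64, dtype=torch.half, device="cuda")
    ckv_cache = torch.randn(num_pages, page_size, 512, dtype=torch.half, device="cuda")
    kpe_cache = torch.randn(num_pages, page_size, 64, dtype=torch.half, device="cuda")
    kv_len = torch.full((batch_size,), 100, dtype=torch.int32, device="cuda")
    page_table = torch.arange(num_pages, dtype=torch.int32, device="cuda").view(
        batch_size, pages_per_seq
    )

    # The cutlass branch concatenates q_nope/q_pe and ckv/kpe internally and
    # calls the cutlass_mla_paged_attention op; the output has q_nope's shape.
    out = mla.run(q_nope, q_pe, ckv_cache, kpe_cache, kv_len=kv_len, page_table=page_table)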

File tree

6 files changed: +417, -3 lines


csrc/cutlass_mla.cu (+45)
@@ -0,0 +1,45 @@
/*
 * Copyright (c) 2024 by FlashInfer team.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <flashinfer/attention/cutlass_mla.cuh>

#include "pytorch_extension_utils.h"

using namespace flashinfer;
using namespace flashinfer::attention;

void CutlassMLAPagedAttention(at::Tensor workspace, at::Tensor out, at::Tensor lse,
                              at::Tensor q_nope_pe, at::Tensor ckv_kpe_cache, at::Tensor kv_lens,
                              at::Tensor page_table) {
  const c10::cuda::OptionalCUDAGuard device_guard(q_nope_pe.device());
  auto stream = at::cuda::getCurrentCUDAStream();

  int device_index = q_nope_pe.device().index();
  int batches = q_nope_pe.sizes()[0];
  int page_count_per_seq = page_table.sizes()[1];
  int page_count_total = ckv_kpe_cache.sizes()[0];
  int page_size = ckv_kpe_cache.sizes()[1];

  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(q_nope_pe.scalar_type(), c_type, [&] {
    using cutlass_t = cutlass_dtype_t<c_type>;
    auto status = runMla<cutlass_t>(
        workspace.data_ptr(), out.data_ptr(), lse.data_ptr(), q_nope_pe.data_ptr(),
        ckv_kpe_cache.data_ptr(), kv_lens.data_ptr(), page_table.data_ptr(), batches,
        page_count_per_seq, page_count_total, page_size, device_index, stream);
    TORCH_CHECK(status == cudaSuccess,
                "Failed to run CutlassMLAPagedAttention: ", cudaGetErrorString(status));
    return true;
  });
}
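Note that the binding derives all problem geometry from the tensor shapes rather than taking explicit size arguments. For reference, a hypothetical Python mirror of that bookkeeping (not part of the commit):

    def cutlass_mla_input_geometry(q_nope_pe, ckv_kpe_cache, kv_lens, page_table):
        """Mirror the sizes CutlassMLAPagedAttention reads via .sizes() in C++."""
        batches = q_nope_pe.shape[0]               # q_nope_pe:     [batches, 128, 576]
        page_count_per_seq = page_table.shape[1]   # page_table:    [batches, pages_per_seq]
        page_count_total = ckv_kpe_cache.shape[0]  # ckv_kpe_cache: [pages, page_size, 576]
        page_size = ckv_kpe_cache.shape[1]
        return batches, page_count_per_seq, page_count_total, page_size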

csrc/flashinfer_mla_ops.cu (+25)
@@ -0,0 +1,25 @@
/*
 * Copyright (c) 2023 by FlashInfer team.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "pytorch_extension_utils.h"

void CutlassMLAPagedAttention(at::Tensor workspace, at::Tensor out, at::Tensor lse,
                              at::Tensor q_nope_pe, at::Tensor ckv_kpe_cache, at::Tensor kv_lens,
                              at::Tensor page_table);

TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
  // "Cutlass MLA Paged Attention"
  m.def("cutlass_mla_paged_attention", CutlassMLAPagedAttention);
}

flashinfer/mla.py (+96, -2)
@@ -20,7 +20,8 @@
 
 import torch
 
-from .jit import gen_batch_mla_module
+from .jit import FLASHINFER_CSRC_DIR, gen_batch_mla_module, load_cuda_ops
+from .jit.env import CUTLASS_INCLUDE_DIRS as CUTLASS_INCLUDE_DIRS
 from .utils import (
     MaskMode,
     _check_shape_dtype_device,
@@ -29,6 +30,57 @@
     register_fake_op,
 )
 
+
+def _check_cutlass_shape(q_nope_pe, ckv_kpe_cache, kv_len, page_table):
+    if q_nope_pe.ndim != 3:
+        raise ValueError(f"Expected q_nope_pe.ndim == 3, got {q_nope_pe.ndim}")
+    if ckv_kpe_cache.ndim != 3:
+        raise ValueError(f"Expected ckv_kpe_cache.ndim == 3, got {ckv_kpe_cache.ndim}")
+    if kv_len.ndim != 1:
+        raise ValueError(f"Expected kv_len.ndim == 1, got {kv_len.ndim}")
+    if page_table.ndim != 2:
+        raise ValueError(f"Expected page_table.ndim == 2, got {page_table.ndim}")
+    B_q, H, D_q = q_nope_pe.shape
+    D_ckv = ckv_kpe_cache.shape[2]
+    if H != 128:
+        raise ValueError(f"Expected 128 heads for q_nope_pe, got {H}")
+    if D_q != D_ckv or D_q != 576:
+        raise ValueError(
+            f"Expected head dim 576 for q_nope_pe and ckv_kpe_cache, got {D_q} and {D_ckv}"
+        )
+    B_block_table, block_num = page_table.shape
+    block_size = ckv_kpe_cache.shape[1]
+    if B_q != B_block_table:
+        raise ValueError(
+            f"Expected batch size {B_q} for q_nope_pe and block_table, got {B_q} and {B_block_table}"
+        )
+    if block_num % (128 / block_size) != 0:
+        raise ValueError(
+            f"Expected block_num % (128 / block_size) == 0, got {block_num=} and {block_size=}"
+        )
+
+
+_mla_module = None
+
+
+def get_mla_module():
+    global _mla_module
+    if _mla_module is None:
+        _mla_module = load_cuda_ops(
+            "mla",
+            [
+                FLASHINFER_CSRC_DIR / "cutlass_mla.cu",
+                FLASHINFER_CSRC_DIR / "flashinfer_mla_ops.cu",
+            ],
+            extra_include_paths=[
+                CUTLASS_INCLUDE_DIRS[0] / ".." / "examples" / "77_blackwell_fmha",
+                CUTLASS_INCLUDE_DIRS[0] / ".." / "examples" / "common",
+            ],
+            extra_cuda_cflags=["-gencode", "arch=compute_100a,code=sm_100a"],
+        )
+    return _mla_module
+
+
 _batch_mla_modules = {}
 _batch_mla_sm90_modules = {}
 
@@ -152,10 +204,17 @@ def __init__(
         backend : str
            The implementation backend, could be ``auto``/``fa2`` or ``fa3``. Defaults to ``auto``.
            If set to ``auto``, the function will automatically choose the backend based on the
-           device architecture and kernel availability.
+           device architecture and kernel availability. If set to ``cutlass``, the MLA kernels
+           are generated by CUTLASS; only ``float_workspace_buffer`` is required and the other
+           arguments are ignored.
         """
         self._float_workspace_buffer = float_workspace_buffer
         self.device = float_workspace_buffer.device
+
+        if backend == "cutlass":
+            self._backend = backend
+            return
+
         self._int_workspace_buffer = torch.empty(
             (8 * 1024 * 1024,), dtype=torch.uint8, device=self.device
         )
@@ -294,6 +353,8 @@ def run(
         lse: Optional[torch.Tensor] = None,
         return_lse: bool = False,
         profiler_buffer: Optional[torch.Tensor] = None,
+        kv_len: Optional[torch.Tensor] = None,
+        page_table: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         r"""Run the MLA attention computation.
 
@@ -317,7 +378,40 @@
            Whether to return the log-sum-exp value, default is False.
         profiler_buffer : Optional[torch.Tensor]
            The buffer to store the profiler data.
+        kv_len : Optional[torch.Tensor]
+           The kv-cache length of each request, shape: ``[batch_size]``. Required when ``backend`` is ``cutlass``.
+        page_table : Optional[torch.Tensor]
+           The page table of the paged kv-cache, shape: ``[batch_size, num_pages]``. Required when ``backend`` is ``cutlass``.
         """
+        if self._backend == "cutlass":
+            if return_lse:
+                raise ValueError("return_lse does not support cutlass backend for now.")
+            if profiler_buffer is not None:
+                raise ValueError(
+                    "profiler_buffer does not support cutlass backend for now."
+                )
+            self._cached_module = get_mla_module()
+            if out is None:
+                out = torch.empty_like(q_nope)
+            else:
+                _check_shape_dtype_device(
+                    out, q_nope.shape, q_nope.dtype, q_nope.device, "out"
+                )
+            q_nope_pe = torch.cat([q_nope, q_pe], dim=-1)
+            ckv_kpe_cache = torch.cat([ckv_cache, kpe_cache], dim=-1)
+            _check_cutlass_shape(q_nope_pe, ckv_kpe_cache, kv_len, page_table)
+            lse = torch.empty(0, dtype=torch.float32, device=self.device)
+            self._cached_module.cutlass_mla_paged_attention.default(
+                self._float_workspace_buffer,
+                out,
+                lse,
+                q_nope_pe,
+                ckv_kpe_cache,
+                kv_len,
+                page_table,
+            )
+            return out
+
         if profiler_buffer is None:
             if self._use_profiler:
                 raise ValueError(
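`_check_cutlass_shape` requires the per-sequence page count (`block_num`) to be a multiple of `128 / page_size`, so page tables built for other backends may need padding before the cutlass path accepts them. A hypothetical helper, assuming `page_size` divides 128 and that padding slots past each request's `kv_len` are never read:

    import torch


    def pad_page_table(page_ids, page_size, device="cuda"):
        """Build an int32 [batch_size, num_pages] page table whose width is a
        multiple of 128 // page_size; unused slots are zero-filled and assumed
        to be ignored beyond each request's kv_len."""
        multiple = 128 // page_size
        width = max(len(ids) for ids in page_ids)
        width = ((width + multiple - 1) // multiple) * multiple  # round up
        table = torch.zeros(len(page_ids), width, dtype=torch.int32, device=device)
        for i, ids in enumerate(page_ids):
            table[i, : len(ids)] = torch.tensor(ids, dtype=torch.int32)
        return table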

flashinfer/utils.py (+5)
@@ -359,6 +359,11 @@ def is_sm90a_supported(device: torch.device) -> bool:
     return major == 9 and torch.version.cuda >= "12.3"
 
 
+def is_sm100a_supported(device: torch.device) -> bool:
+    major, minor = get_compute_capability(device)
+    return major == 10 and minor == 0 and torch.version.cuda >= "12.9"
+
+
 def determine_mla_backend(device: torch.device) -> str:
     return "fa3" if is_sm90a_supported(device) else "fa2"
 
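The new helper is not wired into `determine_mla_backend`, which still returns only `fa2`/`fa3`, so the cutlass backend remains an explicit opt-in. A caller-side guard might look like this (illustrative):

    import torch
    from flashinfer.utils import is_sm100a_supported

    device = torch.device("cuda:0")
    backend = "cutlass" if is_sm100a_supported(device) else "auto"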

include/flashinfer/attention/cutlass_mla.cuh (+163)
@@ -0,0 +1,163 @@
/*
 * Copyright (c) 2024 by FlashInfer team.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef FLASHINFER_ATTENTION_CUTLASS_MLA_CUH_
#define FLASHINFER_ATTENTION_CUTLASS_MLA_CUH_
#include <sstream>

#include "../cutlass_utils.cuh"
#include "../exception.h"
#include "cutlass/kernel_hardware_info.h"

// From 3rdparty/cutlass/examples/77_blackwell_fmha
#include "device/sm100_mla.hpp"
#include "kernel/sm100_mla_tile_scheduler.hpp"

namespace flashinfer {

namespace attention {

using namespace cute;
using namespace cutlass::fmha::kernel;

#define CUTLASS_CHECK(cmd)                                                            \
  do {                                                                                \
    auto status = cmd;                                                                \
    if (status != cutlass::Status::kSuccess) {                                        \
      std::ostringstream err_msg;                                                     \
      err_msg << "cutlass " << #cmd << " failed: " << cutlassGetStatusString(status); \
      FLASHINFER_ERROR(err_msg.str());                                                \
    }                                                                                 \
  } while (0)

template <bool v>
struct IsPersistent {
  static const bool value = v;
};

template <typename T, typename PersistenceOption = IsPersistent<true>>
struct MlaSm100 {
  using Element = T;
  using ElementAcc = float;
  using ElementOut = T;

  using TileShape = Shape<_128, _128, Shape<_512, _64>>;
  using TileShapeH = cute::tuple_element_t<0, TileShape>;
  using TileShapeD = cute::tuple_element_t<2, TileShape>;

  // H K (D_latent D_rope) B
  using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;

  using StrideQ = cute::tuple<int64_t, _1, int64_t>;  // H D B
  using StrideK = cute::tuple<int64_t, _1, int64_t>;  // K D B
  using StrideO = StrideK;                            // H D B
  using StrideLSE = cute::tuple<_1, int>;             // H B

  using TileScheduler =
      std::conditional_t<PersistenceOption::value, Sm100MlaPersistentTileScheduler,
                         Sm100MlaIndividualTileScheduler>;

  using FmhaKernel = cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
      TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler, /*kIsCpAsync=*/true>;
  using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
};

template <typename T>
typename T::Fmha::Arguments args_from_options(void* out_ptr, void* lse_ptr, void* q_absorbed_ptr,
                                              void* ckv_kpe_cache_ptr, void* seq_lens_ptr,
                                              void* page_table_ptr, int batches,
                                              int page_count_per_seq, int page_count_total,
                                              int page_size, int device_index) {
  cutlass::KernelHardwareInfo hw_info;
  hw_info.device_id = device_index;
  hw_info.sm_count =
      cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);

  int max_seq_len = page_size * page_count_per_seq;
  using TileShapeH = typename T::TileShapeH;
  using TileShapeD = typename T::TileShapeD;
  auto problem_shape = cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);

  auto [H, K, D, B] = problem_shape;
  auto [D_latent, D_rope] = D;

  // the scale is based on the non-absorbed sizes, change as appropriate
  // we can't determine this parameter from the info we have, it's an input
  int D_non_latent = 128;
  float scale = 1.0 / sqrt(1.0 * (D_non_latent + D_rope));

  using StrideQ = typename T::StrideQ;
  using StrideK = typename T::StrideK;
  using StrideO = typename T::StrideO;
  using StrideLSE = typename T::StrideLSE;

  StrideQ stride_Q = cute::make_tuple(static_cast<int64_t>(0 + D_latent + D_rope), _1{},
                                      static_cast<int64_t>(H * (0 + D_latent + D_rope)));
  StrideK stride_C = cute::make_tuple(static_cast<int64_t>(0 + D_latent + D_rope), _1{},
                                      static_cast<int64_t>(page_size * (D_latent + D_rope)));
  StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
  StrideLSE stride_LSE = cute::make_tuple(_1{}, 0 + H);
  StrideO stride_O = cute::make_tuple(static_cast<int64_t>(0 + D_latent), _1{},
                                      static_cast<int64_t>(0 + H * D_latent));

  using Element = typename T::Element;
  using ElementOut = typename T::ElementOut;
  using ElementAcc = typename T::ElementAcc;
  auto Q_ptr = reinterpret_cast<Element*>(q_absorbed_ptr);
  auto C_ptr = reinterpret_cast<Element*>(ckv_kpe_cache_ptr);
  typename T::Fmha::Arguments arguments{
      problem_shape,
      {scale, Q_ptr, stride_Q, Q_ptr + D_latent, stride_Q, C_ptr, stride_C, C_ptr + D_latent,
       stride_C, reinterpret_cast<int*>(seq_lens_ptr), reinterpret_cast<int*>(page_table_ptr),
       stride_PT, page_count_total, page_size},
      {reinterpret_cast<ElementOut*>(out_ptr), stride_O,
       // static_cast<ElementAcc*>(lse.data_ptr()), stride_LSE},
       static_cast<ElementAcc*>(nullptr), stride_LSE},
      hw_info,
      -1,       // split_kv
      nullptr,  // is_var_split_kv=false
  };
  // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
  // split_kv automatically based on batch size and sequence length to balance
  // workload across available SMs. Consider using var_split_kv for manual
  // control if needed.
  T::Fmha::set_split_kv(arguments);
  return arguments;
}

template <typename Element>
cudaError_t runMla(void* workspace_ptr, void* out_ptr, void* lse_ptr, void* q_absorbed_ptr,
                   void* ckv_kpe_cache_ptr, void* seq_lens_ptr, void* page_table_ptr, int batches,
                   int page_count_per_seq, int page_count_total, int page_size, int device_index,
                   cudaStream_t stream) {
  using MlaSm100Type = MlaSm100<Element>;
  typename MlaSm100Type::Fmha fmha;
  auto arguments = args_from_options<MlaSm100Type>(
      out_ptr, lse_ptr, q_absorbed_ptr, ckv_kpe_cache_ptr, seq_lens_ptr, page_table_ptr, batches,
      page_count_per_seq, page_count_total, page_size, device_index);

  CUTLASS_CHECK(fmha.can_implement(arguments));

  CUTLASS_CHECK(fmha.initialize(arguments, workspace_ptr, stream));

  CUTLASS_CHECK(fmha.run(arguments, workspace_ptr, stream));

  return cudaSuccess;
}

}  // namespace attention

}  // namespace flashinfer
#endif  // FLASHINFER_ATTENTION_CUTLASS_MLA_CUH_
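Two derived quantities in `args_from_options` are worth spelling out: the softmax scale comes from the non-absorbed head dims (the hard-coded `D_non_latent = 128` plus `D_rope = 64`), and the kernel's problem size uses `max_seq_len = page_size * page_count_per_seq`, with actual per-request lengths supplied via `seq_lens`. A small numeric check with illustrative paging values:

    import math

    D_latent, D_rope = 512, 64    # fixed by MlaSm100's TileShape
    D_non_latent = 128            # hard-coded in args_from_options
    scale = 1.0 / math.sqrt(D_non_latent + D_rope)  # 1/sqrt(192), about 0.0722

    page_size, page_count_per_seq = 64, 2           # example paging layout
    max_seq_len = page_size * page_count_per_seq    # problem-shape K = 128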
