flashinfer-ai
diff --git a/‎flashinfer/decode.py
+58-18 b/‎flashinfer/decode.py
+58-18
diff --git a/‎flashinfer/mla.py
+32-10 b/‎flashinfer/mla.py
+32-10
@@ -45,6 +45,7 @@
     _check_cached_qkv_data_type,
     _check_kv_layout,
     _check_pos_encoding_mode,
+    _check_shape_dtype_device,
     _get_cache_alibi_slopes_buf,
     _get_cache_buf,
     _get_range_buf,
@@ -972,6 +973,8 @@ def run(
         q_scale: Optional[float] = None,
         k_scale: Optional[float] = None,
         v_scale: Optional[float] = None,
+        out: Optional[torch.Tensor] = None,
+        lse: Optional[torch.Tensor] = None,
         return_lse: Literal[False] = False,
     ) -> torch.Tensor: ...
 
@@ -984,6 +987,8 @@ def run(
         q_scale: Optional[float] = None,
         k_scale: Optional[float] = None,
         v_scale: Optional[float] = None,
+        out: Optional[torch.Tensor] = None,
+        lse: Optional[torch.Tensor] = None,
         return_lse: Literal[True] = True,
     ) -> Tuple[torch.Tensor, torch.Tensor]: ...
 
@@ -995,6 +1000,8 @@ def run(
         q_scale: Optional[float] = None,
         k_scale: Optional[float] = None,
         v_scale: Optional[float] = None,
+        out: Optional[torch.Tensor] = None,
+        lse: Optional[torch.Tensor] = None,
         return_lse: bool = False,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         r"""Compute batch decode attention between query and paged kv cache.
@@ -1016,13 +1023,18 @@ def run(
               ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
               :attr:`kv_layout` is ``HND``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
               ``paged_kv_cache[:, 1]`` is the value-cache.
-
+        *args
+            Additional arguments for the custom kernel.
         q_scale : Optional[float]
             The calibration scale of query for fp8 input, if not provided, will be set to ``1.0``.
         k_scale : Optional[float]
             The calibration scale of key for fp8 input, if not provided, will be set to ``1.0``.
         v_scale : Optional[float]
             The calibration scale of value for fp8 input, if not provided, will be set to ``1.0``.
+        out : Optional[torch.Tensor]
+            The output tensor, if not provided, will be allocated internally.
+        lse : Optional[torch.Tensor]
+            The log-sum-exp of attention logits, if not provided, will be allocated internally.
         return_lse : bool
             Whether to return the logsumexp of attention scores, defaults to ``False``.
 
@@ -1061,13 +1073,21 @@ def run(
         if rope_theta is None:
             rope_theta = 1e4
 
-        lse = None
         if return_lse:
-            lse = torch.empty(
-                (q.size(0), q.size(1)), dtype=torch.float32, device=q.device
-            )
+            if lse is None:
+                lse = torch.empty(
+                    (q.size(0), q.size(1)), dtype=torch.float32, device=q.device
+                )
+            else:
+                _check_shape_dtype_device(
+                    lse, (q.size(0), q.size(1)), torch.float32, q.device, "lse"
+                )
+
+        if out is None:
+            out = torch.empty_like(q)
+        else:
+            _check_shape_dtype_device(out, q.shape, q.dtype, q.device, "out")
 
-        out = torch.empty_like(q)
         if self.use_tensor_cores:
             run_args = [
                 self._float_workspace_buffer,
@@ -1270,11 +1290,11 @@ def __init__(
             Whether to enable CUDAGraph for batch decode attention, if enabled, the
             auxiliary data structures will be stored as the provided buffers. The ``batch_size``
             cannot change during the lifecycle of this wrapper when CUDAGraph is enabled.
-        
+
         use_tensor_cores : bool
             Whether to use tensor cores for the computation. Will be faster for large group
             size in grouped query attention. Defaults to ``False``.
-        
+
         paged_kv_indptr_buffer : Optional[torch.Tensor]
             The user reserved buffer on GPU to store the indptr of the paged kv cache, the size
             of the buffer should be ``[batch_size + 1]``.
@@ -1488,6 +1508,8 @@ def run(
         q_scale: Optional[float] = None,
         k_scale: Optional[float] = None,
         v_scale: Optional[float] = None,
+        out: Optional[torch.Tensor] = None,
+        lse: Optional[torch.Tensor] = None,
         return_lse: bool = False,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         r"""Compute batch decode attention between query and paged kv cache.
@@ -1510,6 +1532,10 @@ def run(
             The calibration scale of key for fp8 input, if not provided, will be set to ``1.0``.
         v_scale : Optional[float]
             The calibration scale of value for fp8 input, if not provided, will be set to ``1.0``.
+        out : Optional[torch.Tensor]
+            The output tensor, if not provided, will be allocated internally.
+        lse : Optional[torch.Tensor]
+            The log-sum-exp of attention logits, if not provided, will be allocated internally.
         return_lse : bool
             Whether to return the logsumexp of attention scores, defaults to ``False``.
 
@@ -1539,14 +1565,28 @@ def run(
             rope_theta = 1e4
 
         with self.device as device:
-            o = torch.empty_like(q_nope, device=device)
-            maybe_lse = (
-                torch.empty(
-                    (q_nope.size(0), q_nope.size(1)), dtype=torch.float32, device=device
+            if out is None:
+                out = torch.empty_like(q_nope, device=device)
+            else:
+                _check_shape_dtype_device(
+                    out, q_nope.shape, q_nope.dtype, q_nope.device, "out"
                 )
-                if return_lse
-                else None
-            )
+
+            if return_lse:
+                if lse is None:
+                    lse = torch.empty(
+                        (q_nope.size(0), q_nope.size(1)),
+                        dtype=torch.float32,
+                        device=device,
+                    )
+                else:
+                    _check_shape_dtype_device(
+                        lse,
+                        (q_nope.size(0), q_nope.size(1)),
+                        torch.float32,  # lse is float32 (see allocation above), not q_nope.dtype
+                        q_nope.device,
+                        "lse",
+                    )
             self._cached_module.run(
                 self._float_workspace_buffer,
                 self._int_workspace_buffer,
@@ -1558,16 +1598,16 @@ def run(
                 self._paged_kv_indptr_buf,
                 self._paged_kv_indices_buf,
                 self._paged_kv_last_page_len_buf,
-                o,
+                out,
                 sm_scale,
                 window_left,
                 logits_soft_cap,
                 rope_scale,
                 rope_theta,
-                maybe_lse,
+                lse,
                 get_cuda_stream(device),
             )
-            out = [o, maybe_lse] if return_lse else [o]
+            out = [out, lse] if return_lse else [out]
         if v_scale is not None:
             out[0] *= v_scale
 
 
@@ -21,7 +21,13 @@
 import torch
 
 from .jit import gen_batch_mla_module, get_batch_mla_uri
-from .utils import MaskMode, get_cuda_stream, register_custom_op, register_fake_op
+from .utils import (
+    MaskMode,
+    _check_shape_dtype_device,
+    get_cuda_stream,
+    register_custom_op,
+    register_fake_op,
+)
 
 _batch_mla_modules = {}
 
@@ -267,6 +273,8 @@ def run(
         q_pe: torch.Tensor,
         ckv_cache: torch.Tensor,
         kpe_cache: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        lse: Optional[torch.Tensor] = None,
         return_lse: bool = False,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         r"""Run the MLA attention computation.
@@ -283,6 +291,10 @@ def run(
         kpe_cache : torch.Tensor
             The rope part of the kv-cache tensor, shape: ``[num_pages, page_size, head_dim_kpe]``.
             ``head_dim_kpe`` is 64 in DeepSeek v2/v3 models.
+        out : Optional[torch.Tensor]
+            The output tensor, if not provided, will be allocated internally.
+        lse : Optional[torch.Tensor]
+            The log-sum-exp of attention logits, if not provided, will be allocated internally.
         return_lse : bool, optional
             Whether to return the log-sum-exp value, default is False.
         """
@@ -292,12 +304,22 @@ def run(
         causal = self._causal
         mask_mode = MaskMode.CAUSAL.value if causal else MaskMode.NON_CAUSAL.value
         with self.device as device:
-            o = torch.empty_like(q_nope)
-            maybe_lse = (
-                torch.empty(q_nope.shape[:2], dtype=torch.float32, device=device)
-                if return_lse
-                else None
-            )
+            if out is None:
+                out = torch.empty_like(q_nope)
+            else:
+                _check_shape_dtype_device(
+                    out, q_nope.shape, q_nope.dtype, q_nope.device, "out"
+                )
+
+            if return_lse:
+                if lse is None:
+                    lse = torch.empty(
+                        q_nope.shape[:2], dtype=torch.float32, device=device
+                    )
+                else:
+                    _check_shape_dtype_device(
+                        lse, q_nope.shape[:2], torch.float32, q_nope.device, "lse"
+                    )
             self._cached_module.run(
                 self._float_workspace_buffer,
                 self._int_workspace_buffer,
@@ -307,13 +329,13 @@ def run(
                 ckv_cache,
                 kpe_cache,
                 self._kv_indices_buf,
-                o,
-                maybe_lse,
+                out,
+                lse,
                 mask_mode,
                 num_heads,
                 page_size,
                 sm_scale,
                 get_cuda_stream(device),
             )
 
-        return (o, maybe_lse) if return_lse else o
+        return (out, lse) if return_lse else out