
Commit 8d29762

diptorupd and yzh119 authored
ci: add pre-commit (#931)
- The PR applies the formatting fixes identified by pre-commit that were previously missing from the CUDA headers inside `include`.
- Also adds a GitHub Actions workflow that runs against PRs to enforce the pre-commit checks.

---------

Co-authored-by: Zihao <[email protected]>
1 parent f959354 commit 8d29762

32 files changed: +416, -275 lines

.github/workflows/pre-commit.yml

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+name: pre-commit
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+
+permissions: read-all
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+      - uses: actions/[email protected]
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - uses: pre-commit/[email protected]
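Note: the pre-commit action step runs the hooks defined in the repository's .pre-commit-config.yaml against the checkout. A minimal sketch, assuming the pre-commit CLI is installed locally (e.g. via `pip install pre-commit`), of reproducing the same check before pushing a PR:

import subprocess

# Minimal sketch: run every configured pre-commit hook against all tracked
# files, roughly what the CI step above enforces. A non-zero exit code means
# a hook failed or reformatted files.
result = subprocess.run(["pre-commit", "run", "--all-files"])
raise SystemExit(result.returncode)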

aot_build_utils/generate.py

Lines changed: 15 additions & 5 deletions
@@ -275,7 +275,9 @@ def write_if_different(path: Path, content: str) -> None:
     )
     parser.add_argument(
         "--use_fp16_qk_reductions",
-        type=lambda x: x if isinstance(x, int) else int(x.lower() == "true" or x.lower() == "on"),
+        type=lambda x: (
+            x if isinstance(x, int) else int(x.lower() == "true" or x.lower() == "on")
+        ),
         required=True,
         nargs="+",
         help="Allow fp16 qk reductions",
@@ -289,28 +291,36 @@ def write_if_different(path: Path, content: str) -> None:
     )
     parser.add_argument(
         "--enable_f16",
-        type=lambda x: x if isinstance(x, int) else (x.lower() == "true" or x.lower() == "on"),
+        type=lambda x: (
+            x if isinstance(x, int) else (x.lower() == "true" or x.lower() == "on")
+        ),
         required=True,
         nargs="?",
         help="Enable fp16",
     )
     parser.add_argument(
         "--enable_bf16",
-        type=lambda x: x if isinstance(x, int) else (x.lower() == "true" or x.lower() == "on"),
+        type=lambda x: (
+            x if isinstance(x, int) else (x.lower() == "true" or x.lower() == "on")
+        ),
         required=True,
         nargs="?",
         help="Enable bf16",
     )
     parser.add_argument(
         "--enable_fp8_e4m3",
-        type=lambda x: x if isinstance(x, int) else (x.lower() == "true" or x.lower() == "on"),
+        type=lambda x: (
+            x if isinstance(x, int) else (x.lower() == "true" or x.lower() == "on")
+        ),
         default=True,
         nargs="?",
         help="Enable fp8_e4m3",
     )
     parser.add_argument(
         "--enable_fp8_e5m2",
-        type=lambda x: x if isinstance(x, int) else (x.lower() == "true" or x.lower() == "on"),
+        type=lambda x: (
+            x if isinstance(x, int) else (x.lower() == "true" or x.lower() == "on")
+        ),
         default=True,
         nargs="?",
         help="Enable fp8_e5m2",

aot_build_utils/generate_dispatch_inc.py

Lines changed: 5 additions & 1 deletion
@@ -100,7 +100,11 @@ def get_dispatch_inc_str(args: argparse.Namespace) -> str:
         "--path", type=str, required=True, help="Path to the dispatch inc file"
     )
     parser.add_argument(
-        "--head_dims_sm90", type=str, required=True, nargs="+", help="Head dimensions in format of 'head_dim_qk,head_dim_vo'",
+        "--head_dims_sm90",
+        type=str,
+        required=True,
+        nargs="+",
+        help="Head dimensions in format of 'head_dim_qk,head_dim_vo'",
     )
     parser.add_argument(
         "--head_dims", type=int, required=True, nargs="+", help="Head dimensions"

benchmarks/bench_fused_add_rmsnorm.py

Lines changed: 19 additions & 9 deletions
@@ -6,12 +6,20 @@
 
 import flashinfer
 
+
 @torch.inference_mode()
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--batch-sizes", nargs='+', type=int, default=[1, 19, 99, 989])
-    parser.add_argument("--hidden-sizes", nargs='+', type=int, default=[111, 500, 1024, 3072, 4096, 8192])
-    parser.add_argument("--dtypes", nargs='+', choices=["float16", "bfloat16"], default=["float16"])
+    parser.add_argument("--batch-sizes", nargs="+", type=int, default=[1, 19, 99, 989])
+    parser.add_argument(
+        "--hidden-sizes",
+        nargs="+",
+        type=int,
+        default=[111, 500, 1024, 3072, 4096, 8192],
+    )
+    parser.add_argument(
+        "--dtypes", nargs="+", choices=["float16", "bfloat16"], default=["float16"]
+    )
     args = parser.parse_args()
 
     eps = 1e-6
@@ -27,18 +35,19 @@ def main():
                 residual = torch.randn_like(x)
                 weight = torch.randn(hidden_size, dtype=dtype, device="cuda")
 
-                @torch.cuda.nvtx.range(f"fused_add_rmsnorm batch_size={batch_size}, hidden_size={hidden_size}, dtype={dtype_str}")
+                @torch.cuda.nvtx.range(
+                    f"fused_add_rmsnorm batch_size={batch_size}, hidden_size={hidden_size}, dtype={dtype_str}"
+                )
                 def fn() -> None:
                     flashinfer.fused_add_rmsnorm(x, residual, weight, eps)
 
                 # Run benchmarking
                 latency_ms = cast(float, do_bench(fn))
                 throughput = (
-                    (x.numel() * x.element_size() * 2
-                    + residual.numel() * residual.element_size() * 2
-                    + weight.numel() * weight.element_size())
-                    / (latency_ms * 1e-3)
-                )
+                    x.numel() * x.element_size() * 2
+                    + residual.numel() * residual.element_size() * 2
+                    + weight.numel() * weight.element_size()
+                ) / (latency_ms * 1e-3)
                 print(
                     f"batch_size: {batch_size:3},",
                     f"hidden_size: {hidden_size:5},",
@@ -51,5 +60,6 @@ def fn() -> None:
 
     torch.cuda.profiler.stop()
 
+
 if __name__ == "__main__":
     main()
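The throughput expression reshaped above models memory traffic: the factor of 2 on x and residual presumably accounts for one read plus one in-place write of each tensor, while weight is only read once, and the byte total is divided by the latency in seconds. A small worked example with made-up numbers (fp16 tensors, hypothetical latency):

# Worked example of the throughput formula above, illustration only.
batch_size, hidden_size, elem_size = 989, 4096, 2  # fp16 -> 2 bytes per element
x_bytes = batch_size * hidden_size * elem_size      # x: read and written in place
residual_bytes = x_bytes                            # residual: read and written in place
weight_bytes = hidden_size * elem_size              # weight: read only
latency_ms = 0.05                                   # hypothetical latency
throughput = (x_bytes * 2 + residual_bytes * 2 + weight_bytes) / (latency_ms * 1e-3)
print(f"{throughput / 1e9:.1f} GB/s")  # ~648.3 GB/s for these numbers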

benchmarks/bench_rope.py

Lines changed: 81 additions & 16 deletions
@@ -11,9 +11,13 @@
 
 import torch
 import torch.nn as nn
-from flashinfer.rope import apply_rope_with_cos_sin_cache_inplace
-from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding as vLLMRotaryEmbedding
 import triton
+from vllm.model_executor.layers.rotary_embedding import (
+    RotaryEmbedding as vLLMRotaryEmbedding,
+)
+
+from flashinfer.rope import apply_rope_with_cos_sin_cache_inplace
+
 
 class FlashInferRotaryEmbedding(nn.Module):
 
@@ -39,8 +43,12 @@ def __init__(
         self.register_buffer("cos_sin_cache", cache, persistent=False)
 
     def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
-        inv_freq = 1.0 / (base**(torch.arange(
-            0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim))
+        inv_freq = 1.0 / (
+            base
+            ** (
+                torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
+            )
+        )
         return inv_freq
 
     def _compute_cos_sin_cache(self) -> torch.Tensor:
@@ -82,7 +90,7 @@ def _apply_rotary_emb(
             return torch.cat((o1, o2), dim=-1)
         else:
             return torch.stack((o1, o2), dim=-1).flatten(-2)
-
+
     def forward_cuda(
         self,
         positions: torch.Tensor,
@@ -100,42 +108,99 @@ def forward_cuda(
         )
         return query, key
 
+
 @triton.testing.perf_report(
     triton.testing.Benchmark(
         x_names=["seq_len"],
-        x_vals=[2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536],
+        x_vals=[
+            2,
+            4,
+            8,
+            16,
+            32,
+            64,
+            128,
+            256,
+            512,
+            1024,
+            2048,
+            4096,
+            8192,
+            16384,
+            32768,
+            65536,
+        ],
         line_arg="provider",
         line_vals=["flashinfer", "native", "vllm"],
         line_names=["FlashInfer", "Native", "vLLM"],
         styles=[("blue", "-"), ("red", "-"), ("green", "-")],
        ylabel="Latency (ms)",
        plot_name="rope-latency",
-        args={"head_size": 4096//32, "rotary_dim": 4096//32, "max_position_embeddings": 65536, "base": 500000, "is_neox_style": True, "dtype": torch.bfloat16, "device": "cuda", "batch_size": 2, "num_q_heads": 32, "num_kv_heads": 8},
+        args={
+            "head_size": 4096 // 32,
+            "rotary_dim": 4096 // 32,
+            "max_position_embeddings": 65536,
+            "base": 500000,
+            "is_neox_style": True,
+            "dtype": torch.bfloat16,
+            "device": "cuda",
+            "batch_size": 2,
+            "num_q_heads": 32,
+            "num_kv_heads": 8,
+        },
     )
 )
-def benchmark(provider, head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype, device, batch_size, seq_len, num_q_heads, num_kv_heads):
-    print(f"provider: {provider}, head_size: {head_size}, rotary_dim: {rotary_dim}, max_position_embeddings: {max_position_embeddings}, base: {base}, is_neox_style: {is_neox_style}, dtype: {dtype}, device: {device}, batch_size: {batch_size}, seq_len: {seq_len}, num_q_heads: {num_q_heads}, num_kv_heads: {num_kv_heads}")
-
+def benchmark(
+    provider,
+    head_size,
+    rotary_dim,
+    max_position_embeddings,
+    base,
+    is_neox_style,
+    dtype,
+    device,
+    batch_size,
+    seq_len,
+    num_q_heads,
+    num_kv_heads,
+):
+    print(
+        f"provider: {provider}, head_size: {head_size}, rotary_dim: {rotary_dim}, max_position_embeddings: {max_position_embeddings}, base: {base}, is_neox_style: {is_neox_style}, dtype: {dtype}, device: {device}, batch_size: {batch_size}, seq_len: {seq_len}, num_q_heads: {num_q_heads}, num_kv_heads: {num_kv_heads}"
+    )
+
     rope_forward = None
 
     if provider == "vllm":
-        rope = vLLMRotaryEmbedding(head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype).to(device)
+        rope = vLLMRotaryEmbedding(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        ).to(device)
         rope_forward = rope.forward_cuda
     elif provider == "flashinfer":
-        rope = FlashInferRotaryEmbedding(head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype).to(device)
+        rope = FlashInferRotaryEmbedding(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        ).to(device)
         rope_forward = rope.forward_cuda
     elif provider == "native":
-        rope = vLLMRotaryEmbedding(head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype).to(device)
+        rope = vLLMRotaryEmbedding(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        ).to(device)
         rope_forward = rope.forward_native
 
     pos_ids = torch.arange(seq_len, device=device).repeat(batch_size)
-    query = torch.randn(batch_size * seq_len, num_q_heads * head_size, dtype=dtype, device=device)
-    key = torch.randn(batch_size * seq_len, num_kv_heads * head_size, dtype=dtype, device=device)
+    query = torch.randn(
+        batch_size * seq_len, num_q_heads * head_size, dtype=dtype, device=device
+    )
+    key = torch.randn(
+        batch_size * seq_len, num_kv_heads * head_size, dtype=dtype, device=device
+    )
 
     quantiles = [0.5, 0.2, 0.8]
-    ms, min_ms, max_ms = triton.testing.do_bench(lambda: rope_forward(pos_ids, query, key), quantiles=quantiles)
+    ms, min_ms, max_ms = triton.testing.do_bench(
+        lambda: rope_forward(pos_ids, query, key), quantiles=quantiles
+    )
 
     return ms, min_ms, max_ms
 
+
 if __name__ == "__main__":
     benchmark.run(print_data=True, show_plots=True, save_path="rope_benchmark.png")
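The _compute_inv_freq reformatting above wraps the standard rotary-embedding frequency formula, inv_freq[i] = 1 / base ** (2 * i / rotary_dim). A minimal standalone sketch of the same computation, using values matching the benchmark defaults (rotary_dim = 4096 // 32, base = 500000):

import torch

# Standalone sketch of the inverse-frequency computation reformatted above:
# one frequency per pair of rotary dimensions.
rotary_dim, base = 4096 // 32, 500000
inv_freq = 1.0 / (
    base ** (torch.arange(0, rotary_dim, 2, dtype=torch.float) / rotary_dim)
)
print(inv_freq.shape)       # torch.Size([64]) -> rotary_dim // 2 frequencies
print(inv_freq[0].item())   # 1.0, the fastest-rotating pair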

csrc/batch_decode.cu

Lines changed: 9 additions & 9 deletions
@@ -19,8 +19,8 @@
 #include <optional>
 
 #include "batch_decode_config.inc"
-#include "pytorch_extension_utils.h"
 #include "pytorch_conversion_utils.h"
+#include "pytorch_extension_utils.h"
 
 namespace flashinfer {
 
@@ -36,9 +36,9 @@ using namespace flashinfer;
 at::Tensor BatchDecodeWithPagedKVCachePlan(
     at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer,
     at::Tensor page_locked_int_workspace_buffer, at::Tensor indptr, int64_t batch_size,
-    int64_t num_qo_heads, int64_t num_kv_heads, int64_t page_size,
-    bool enable_cuda_graph, int64_t window_left, double logits_soft_cap, int64_t head_dim_qk,
-    int64_t head_dim_vo, at::Tensor empty_q_data, at::Tensor empty_kv_data, int64_t cuda_stream) {
+    int64_t num_qo_heads, int64_t num_kv_heads, int64_t page_size, bool enable_cuda_graph,
+    int64_t window_left, double logits_soft_cap, int64_t head_dim_qk, int64_t head_dim_vo,
+    at::Tensor empty_q_data, at::Tensor empty_kv_data, int64_t cuda_stream) {
   size_t float_workspace_size_in_bytes =
       float_workspace_buffer.size(0) * float_workspace_buffer.element_size();
   size_t int_workspace_size_in_bytes =
@@ -78,11 +78,11 @@ at::Tensor BatchDecodeWithPagedKVCachePlan(
 }
 
 void BatchDecodeWithPagedKVCacheRun(
-    at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer,
-    at::Tensor plan_info_vec, at::Tensor q, at::Tensor paged_k_cache,
-    at::Tensor paged_v_cache, at::Tensor paged_kv_indptr, at::Tensor paged_kv_indices,
-    at::Tensor paged_kv_last_page_len, at::Tensor o, std::optional<at::Tensor> maybe_lse,
-    int64_t kv_layout_code, int64_t window_left ADDITIONAL_FUNC_PARAMS, int64_t cuda_stream) {
+    at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer, at::Tensor plan_info_vec,
+    at::Tensor q, at::Tensor paged_k_cache, at::Tensor paged_v_cache, at::Tensor paged_kv_indptr,
+    at::Tensor paged_kv_indices, at::Tensor paged_kv_last_page_len, at::Tensor o,
+    std::optional<at::Tensor> maybe_lse, int64_t kv_layout_code,
+    int64_t window_left ADDITIONAL_FUNC_PARAMS, int64_t cuda_stream) {
   DecodePlanInfo plan_info;
   plan_info.FromVector(tensor_to_vec(plan_info_vec));
   QKVLayout kv_layout = static_cast<QKVLayout>(kv_layout_code);

csrc/batch_decode_jit_pybind.cu

Lines changed: 8 additions & 8 deletions
@@ -19,16 +19,16 @@
 at::Tensor BatchDecodeWithPagedKVCachePlan(
     at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer,
     at::Tensor page_locked_int_workspace_buffer, at::Tensor indptr, int64_t batch_size,
-    int64_t num_qo_heads, int64_t num_kv_heads, int64_t page_size,
-    bool enable_cuda_graph, int64_t window_left, double logits_soft_cap, int64_t head_dim_qk,
-    int64_t head_dim_vo, at::Tensor empty_q_data, at::Tensor empty_kv_data, int64_t cuda_stream);
+    int64_t num_qo_heads, int64_t num_kv_heads, int64_t page_size, bool enable_cuda_graph,
+    int64_t window_left, double logits_soft_cap, int64_t head_dim_qk, int64_t head_dim_vo,
+    at::Tensor empty_q_data, at::Tensor empty_kv_data, int64_t cuda_stream);
 
 void BatchDecodeWithPagedKVCacheRun(
-    at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer,
-    at::Tensor plan_info_vec, at::Tensor q, at::Tensor paged_k_cache,
-    at::Tensor paged_v_cache, at::Tensor paged_kv_indptr, at::Tensor paged_kv_indices,
-    at::Tensor paged_kv_last_page_len, at::Tensor o, std::optional<at::Tensor> maybe_lse,
-    int64_t kv_layout_code, int64_t window_left ADDITIONAL_FUNC_PARAMS, int64_t cuda_stream);
+    at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer, at::Tensor plan_info_vec,
+    at::Tensor q, at::Tensor paged_k_cache, at::Tensor paged_v_cache, at::Tensor paged_kv_indptr,
+    at::Tensor paged_kv_indices, at::Tensor paged_kv_last_page_len, at::Tensor o,
+    std::optional<at::Tensor> maybe_lse, int64_t kv_layout_code,
+    int64_t window_left ADDITIONAL_FUNC_PARAMS, int64_t cuda_stream);
 
 TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
   // Batched decode with paged KV-Cache plan
