
Commit d835e6f

refactor: move triton dependency to flashinfer.triton (#918)
Some platforms do not support Triton, but users still need other functionality in flashinfer (e.g., JIT compilation). This PR moves the Triton dependency into flashinfer.triton and defers the import so that flashinfer can still be used without installing Triton.
1 parent bf2fdc5 commit d835e6f
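
The pattern this commit applies throughout is a function-local ("deferred") import: each module that launches a Triton kernel imports it inside the launch function rather than at module scope, so importing flashinfer itself never requires Triton. Below is a minimal sketch of the idea; the module and kernel names (my_module, my_kernel) are illustrative, not part of this commit:

import torch

def launch_my_kernel(x: torch.Tensor) -> None:
    # Deferred import: triton is only loaded when this function runs,
    # so `import flashinfer` succeeds even if triton is not installed.
    from flashinfer.triton.my_module import my_kernel  # hypothetical names

    my_kernel[(x.numel(),)](x)

With this arrangement, a missing Triton installation surfaces as an ImportError at the first Triton-backed call rather than at package import time.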

File tree

4 files changed: +152 -111 lines changed


flashinfer/gemm.py (+4 -88)

@@ -18,8 +18,6 @@
 from typing import Optional

 import torch
-import triton
-import triton.language as tl

 from .jit import FLASHINFER_CSRC_DIR, has_prebuilt_ops, load_cuda_ops
 from .utils import (
@@ -223,92 +221,6 @@ def _fake_cutlass_segment_gemm_sm90(
     return _gemm_module_sm90


-@triton.jit
-def compute_sm80_group_gemm_args(
-    all_problems_ptr,
-    x_ptr,
-    w_ptr,
-    y_ptr,
-    x_ld_ptr,
-    w_ld_ptr,
-    y_ld_ptr,
-    x,
-    w,
-    y,
-    xy_indptr,
-    w_indices,
-    d_in,
-    d_out,
-    w_column_major,
-):
-
-    pid = tl.program_id(0)
-
-    m = tl.load(xy_indptr + pid + 1) - tl.load(xy_indptr + pid)
-    k, n = d_in, d_out
-
-    tl.store(all_problems_ptr + pid * 3, m)
-    tl.store(all_problems_ptr + pid * 3 + 1, n)
-    tl.store(all_problems_ptr + pid * 3 + 2, k)
-
-    w_i = tl.load(w_indices + pid) if w_indices else tl.cast(pid, tl.int64)
-    w_curr_ptr = w + w_i * k * n
-    tl.store(w_ptr + pid, w_curr_ptr)
-
-    x_curr_ptr = x + tl.load(xy_indptr + pid) * k
-    tl.store(x_ptr + pid, x_curr_ptr)
-
-    y_curr_ptr = y + tl.load(xy_indptr + pid) * n
-    tl.store(y_ptr + pid, y_curr_ptr)
-
-    tl.store(x_ld_ptr + pid, k)
-    tl.store(w_ld_ptr + pid, k if w_column_major else n)
-    tl.store(y_ld_ptr + pid, n)
-
-
-@triton.jit
-def compute_sm90_group_gemm_args(
-    all_problems_ptr,
-    x_ptr,
-    w_ptr,
-    y_ptr,
-    x_stride_ptr,
-    w_stride_ptr,
-    y_stride_ptr,
-    x,
-    w,
-    y,
-    xy_indptr,
-    w_indices,
-    d_in,
-    d_out,
-    w_column_major,
-):
-
-    pid = tl.program_id(0)
-
-    m = tl.load(xy_indptr + pid + 1) - tl.load(xy_indptr + pid)
-    k, n = d_in, d_out
-
-    tl.store(all_problems_ptr + pid * 3, m)
-    tl.store(all_problems_ptr + pid * 3 + 1, n)
-    tl.store(all_problems_ptr + pid * 3 + 2, k)
-
-    w_i = tl.load(w_indices + pid) if w_indices else tl.cast(pid, tl.int64)
-    w_curr_ptr = w + w_i * k * n
-    tl.store(w_ptr + pid, w_curr_ptr)
-
-    x_curr_ptr = x + tl.load(xy_indptr + pid) * k
-    tl.store(x_ptr + pid, x_curr_ptr)
-
-    y_curr_ptr = y + tl.load(xy_indptr + pid) * n
-    tl.store(y_ptr + pid, y_curr_ptr)
-
-    tl.store(x_stride_ptr + pid, k)
-    tl.store(w_stride_ptr + pid, k if w_column_major else n)
-    tl.store(y_stride_ptr + pid, n)
-
-
 def launch_compute_sm80_group_gemm_args(
     x: torch.Tensor,
     weights: torch.Tensor,
@@ -340,6 +252,8 @@ def launch_compute_sm80_group_gemm_args(
     w_stride_data = torch.empty(batch_size, dtype=ld_type, device=device)
     y_stride_data = torch.empty(batch_size, dtype=ld_type, device=device)

+    from .triton.gemm import compute_sm80_group_gemm_args
+
     compute_sm80_group_gemm_args[(batch_size,)](
         all_problems,
         x_data,
@@ -400,6 +314,8 @@ def launch_compute_sm90_group_gemm_args(
     w_stride_data = torch.empty(batch_size, dtype=stride_type, device=device)
     y_stride_data = torch.empty(batch_size, dtype=stride_type, device=device)

+    from .triton.gemm import compute_sm90_group_gemm_args
+
     compute_sm90_group_gemm_args[(batch_size,)](
         all_problems,
         x_data,
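
An aside on the launch syntax retained above: kernel[(batch_size,)](...) is Triton's launch-grid convention, where the tuple sets the number of program instances and each instance reads its grid index via tl.program_id(0). A standalone illustration (not from this commit):

import torch
import triton
import triton.language as tl

@triton.jit
def write_pid_kernel(out_ptr):
    # Each of the grid's program instances stores its own index.
    pid = tl.program_id(0)
    tl.store(out_ptr + pid, pid)

out = torch.empty(4, dtype=torch.int32, device="cuda")
write_pid_kernel[(4,)](out)  # grid of 4 programs -> out == [0, 1, 2, 3]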

flashinfer/page.py (+2 -23)

@@ -17,8 +17,6 @@
 from typing import Optional, Tuple, Union

 import torch
-import triton
-import triton.language as tl

 from .jit import FLASHINFER_CSRC_DIR, has_prebuilt_ops, load_cuda_ops
 from .utils import (
@@ -142,27 +140,6 @@ def _fake_append_paged_kv_cache_kernel(
     pass


-@triton.jit
-def get_batch_indices_positions_kernel(
-    append_indptr,
-    seq_lens_ptr,
-    batch_indices_ptr,
-    positions_ptr,
-    num_stages: tl.constexpr,
-):
-    batch_idx = tl.program_id(0)
-
-    batch_start = tl.load(append_indptr + batch_idx)
-    batch_end = tl.load(append_indptr + batch_idx + 1)
-    seq_len = tl.load(seq_lens_ptr + batch_idx)
-
-    for i in tl.range(batch_start, batch_end, 128, num_stages=num_stages):
-        offsets = tl.arange(0, 128) + i
-        mask = offsets < batch_end
-        tl.store(batch_indices_ptr + offsets, batch_idx, mask)
-        tl.store(positions_ptr + offsets, offsets + seq_len - batch_end, mask)
-
-
 def get_batch_indices_positions(
     append_indptr: torch.Tensor, seq_lens: torch.Tensor, nnz: int
 ) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -210,6 +187,8 @@ def get_batch_indices_positions(
     batch_size = append_indptr.size(0) - 1
     batch_indices = torch.empty((nnz,), device=append_indptr.device, dtype=torch.int32)
     positions = torch.empty((nnz,), device=append_indptr.device, dtype=torch.int32)
+    from .triton.page import get_batch_indices_positions_kernel
+
     get_batch_indices_positions_kernel[(batch_size,)](
         append_indptr, seq_lens, batch_indices, positions, num_stages=2
     )

flashinfer/triton/gemm.py (+104, new file)

@@ -0,0 +1,104 @@
+"""
+Copyright (c) 2024 by FlashInfer team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def compute_sm80_group_gemm_args(
+    all_problems_ptr,
+    x_ptr,
+    w_ptr,
+    y_ptr,
+    x_ld_ptr,
+    w_ld_ptr,
+    y_ld_ptr,
+    x,
+    w,
+    y,
+    xy_indptr,
+    w_indices,
+    d_in,
+    d_out,
+    w_column_major,
+):
+
+    pid = tl.program_id(0)
+
+    m = tl.load(xy_indptr + pid + 1) - tl.load(xy_indptr + pid)
+    k, n = d_in, d_out
+
+    tl.store(all_problems_ptr + pid * 3, m)
+    tl.store(all_problems_ptr + pid * 3 + 1, n)
+    tl.store(all_problems_ptr + pid * 3 + 2, k)
+
+    w_i = tl.load(w_indices + pid) if w_indices else tl.cast(pid, tl.int64)
+    w_curr_ptr = w + w_i * k * n
+    tl.store(w_ptr + pid, w_curr_ptr)
+
+    x_curr_ptr = x + tl.load(xy_indptr + pid) * k
+    tl.store(x_ptr + pid, x_curr_ptr)
+
+    y_curr_ptr = y + tl.load(xy_indptr + pid) * n
+    tl.store(y_ptr + pid, y_curr_ptr)
+
+    tl.store(x_ld_ptr + pid, k)
+    tl.store(w_ld_ptr + pid, k if w_column_major else n)
+    tl.store(y_ld_ptr + pid, n)
+
+
+@triton.jit
+def compute_sm90_group_gemm_args(
+    all_problems_ptr,
+    x_ptr,
+    w_ptr,
+    y_ptr,
+    x_stride_ptr,
+    w_stride_ptr,
+    y_stride_ptr,
+    x,
+    w,
+    y,
+    xy_indptr,
+    w_indices,
+    d_in,
+    d_out,
+    w_column_major,
+):
+
+    pid = tl.program_id(0)
+
+    m = tl.load(xy_indptr + pid + 1) - tl.load(xy_indptr + pid)
+    k, n = d_in, d_out
+
+    tl.store(all_problems_ptr + pid * 3, m)
+    tl.store(all_problems_ptr + pid * 3 + 1, n)
+    tl.store(all_problems_ptr + pid * 3 + 2, k)
+
+    w_i = tl.load(w_indices + pid) if w_indices else tl.cast(pid, tl.int64)
+    w_curr_ptr = w + w_i * k * n
+    tl.store(w_ptr + pid, w_curr_ptr)
+
+    x_curr_ptr = x + tl.load(xy_indptr + pid) * k
+    tl.store(x_ptr + pid, x_curr_ptr)
+
+    y_curr_ptr = y + tl.load(xy_indptr + pid) * n
+    tl.store(y_ptr + pid, y_curr_ptr)
+
+    tl.store(x_stride_ptr + pid, k)
+    tl.store(w_stride_ptr + pid, k if w_column_major else n)
+    tl.store(y_stride_ptr + pid, n)
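
Both kernels fill one grouped-GEMM descriptor per program instance: the problem shape (m, n, k), base pointers into x, w, and y, and the leading dimensions (SM80) or strides (SM90). The following plain-Python mirror of that bookkeeping is an illustration only; the function name and return layout are hypothetical, and the pointer arrays are omitted:

import torch

def reference_group_gemm_args(xy_indptr, d_in, d_out, w_column_major):
    # CPU mirror of compute_sm80_group_gemm_args' shape/ld bookkeeping.
    batch_size = xy_indptr.numel() - 1
    problems, lds = [], []
    for pid in range(batch_size):
        m = int(xy_indptr[pid + 1] - xy_indptr[pid])  # rows in group pid
        k, n = d_in, d_out
        problems.append((m, n, k))
        # x and y are row-major with k and n columns; w's leading
        # dimension is k when column-major, n otherwise.
        lds.append((k, k if w_column_major else n, n))
    return problems, lds

# Two groups of 2 and 3 rows, d_in=4, d_out=8, row-major w:
print(reference_group_gemm_args(torch.tensor([0, 2, 5]), 4, 8, False))
# -> ([(2, 8, 4), (3, 8, 4)], [(4, 8, 8), (4, 8, 8)])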

flashinfer/triton/page.py (+42, new file)

@@ -0,0 +1,42 @@
+"""
+Copyright (c) 2025 by FlashInfer team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from typing import Optional, Tuple, Union
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def get_batch_indices_positions_kernel(
+    append_indptr,
+    seq_lens_ptr,
+    batch_indices_ptr,
+    positions_ptr,
+    num_stages: tl.constexpr,
+):
+    batch_idx = tl.program_id(0)
+
+    batch_start = tl.load(append_indptr + batch_idx)
+    batch_end = tl.load(append_indptr + batch_idx + 1)
+    seq_len = tl.load(seq_lens_ptr + batch_idx)
+
+    for i in tl.range(batch_start, batch_end, 128, num_stages=num_stages):
+        offsets = tl.arange(0, 128) + i
+        mask = offsets < batch_end
+        tl.store(batch_indices_ptr + offsets, batch_idx, mask)
+        tl.store(positions_ptr + offsets, offsets + seq_len - batch_end, mask)
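
This kernel is reached through the get_batch_indices_positions wrapper shown in the page.py hunk above; it expands a ragged batch into one (batch index, position) pair per appended token. A usage sketch, assuming three requests whose KV lengths equal their appended lengths, and assuming the wrapper is re-exported at the package root (otherwise import it from flashinfer.page):

import torch
import flashinfer

seq_lens = torch.tensor([1, 2, 3], dtype=torch.int32, device="cuda")
append_indptr = torch.cat(
    [
        torch.zeros(1, dtype=torch.int32, device="cuda"),
        torch.cumsum(seq_lens, dim=0, dtype=torch.int32),
    ]
)  # [0, 1, 3, 6]
batch_indices, positions = flashinfer.get_batch_indices_positions(
    append_indptr, seq_lens, nnz=int(append_indptr[-1].item())
)
# batch_indices -> [0, 1, 1, 2, 2, 2]  (owning request of each token)
# positions     -> [0, 0, 1, 0, 1, 2]  (token's position in its sequence)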
