misc: more benchmark scripts in Python (#1010)

yzh119 · web-flow · commit f579ca254ca3 · 2025-04-09T21:42:13.000-04:00
Move benchmarks from the C++ side to Python for easier performance tracking.
diff --git a/benchmarks/bench_batch_decode.py b/benchmarks/bench_batch_decode.py
@@ -0,0 +1,94 @@
+"""
+Copyright (c) 2024 by FlashInfer team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import numpy as np
+import torch
+from triton.testing import do_bench
+
+import flashinfer
+
+page_block_size = 16  # tokens per KV-cache page
+num_kv_heads = 4  # number of key/value heads
+num_qo_heads = 32  # number of query/output heads
+head_dim = 128  # size of each attention head
+
+
+def bench_batch_decode(
+    batch_size,
+    seq_len,
+    num_qo_heads,
+    num_kv_heads,
+    head_dim,
+    page_block_size,
+    q_dtype,
+    kv_dtype,
+):
+    np.random.seed(42)
+    seq_lens = torch.full((batch_size,), seq_len)
+    seq_lens_blocks = torch.ceil(seq_lens / page_block_size).int()
+    kv_indptr = torch.cat([torch.tensor([0]), torch.cumsum(seq_lens_blocks, 0)], dim=0)
+    kv_indptr = kv_indptr.int()
+    last_page_len = seq_lens - (seq_lens_blocks - 1) * page_block_size
+    last_page_len = last_page_len.int()
+    num_blocks = kv_indptr[-1].item()
+
+    q = torch.rand(batch_size, num_qo_heads, head_dim, dtype=q_dtype, device="cuda:0")
+    kv_data = torch.randn(
+        num_blocks, 2, page_block_size, num_kv_heads, head_dim, device="cuda:0"
+    ).to(kv_dtype)
+    workspace_buffer = torch.empty(
+        128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0"
+    )
+    wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
+        workspace_buffer, kv_layout="NHD", use_tensor_cores=True
+    )
+    wrapper.plan(
+        kv_indptr.to(0),
+        torch.arange(num_blocks).int().to(0),
+        last_page_len.to(0),
+        num_qo_heads,
+        num_kv_heads,
+        head_dim,
+        page_block_size,
+        data_type=kv_dtype,
+        q_data_type=q_dtype,
+    )
+
+    ms = do_bench(lambda: wrapper.run(q, kv_data))
+
+    io = q.numel() * q.element_size() + kv_data.numel() * kv_data.element_size()
+    print(
+        f"batch_size={batch_size}, seq_len={seq_len}, num_qo_heads={num_qo_heads}, num_kv_heads={num_kv_heads}, head_dim={head_dim}, page_block_size={page_block_size}, q_dtype={q_dtype}, kv_dtype={kv_dtype}"
+    )
+    print(f"execution time: {ms}ms")
+    print(f"memory bandwidth: {io / ms / 1024 / 1024 :.2f} GB/s")
+
+
+if __name__ == "__main__":
+    for q_dtype in [torch.bfloat16]:
+        for kv_dtype in [torch.bfloat16, torch.float8_e4m3fn]:
+            for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]:
+                for seq_len in [512, 1024, 2048, 4096, 8192, 16384]:
+                    bench_batch_decode(
+                        batch_size,
+                        seq_len,
+                        num_qo_heads,
+                        num_kv_heads,
+                        head_dim,
+                        page_block_size,
+                        q_dtype,
+                        kv_dtype,
+                    )
diff --git a/benchmarks/bench_grouped_gemm.py b/benchmarks/bench_grouped_gemm.py
@@ -0,0 +1,69 @@
+"""
+Copyright (c) 2024 by FlashInfer team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import numpy as np
+import torch
+from triton.testing import do_bench
+
+import flashinfer
+
+
+def bench_grouped_gemm(
+    batch_size, num_tokens_per_group, d_in, d_out, dtype, output_dtype
+):
+    np.random.seed(42)
+    W = torch.randn(batch_size, d_out, d_in, device="cuda:0").to(dtype)
+    X = torch.randn(batch_size * num_tokens_per_group, d_in, device="cuda:0").to(dtype)
+    Y = torch.empty(
+        batch_size * num_tokens_per_group, d_out, dtype=output_dtype, device="cuda:0"
+    )
+
+    workspace_buffer = torch.empty(32 * 1024 * 1024, dtype=torch.int8, device="cuda:0")
+    segment_gemm = flashinfer.gemm.SegmentGEMMWrapper(workspace_buffer, backend="auto")
+    seg_indptr = torch.arange(
+        0,
+        (batch_size + 1) * num_tokens_per_group,
+        num_tokens_per_group,
+        dtype=torch.int64,
+        device="cuda:0",
+    )
+
+    ms = do_bench(
+        lambda: segment_gemm.run(X, W, batch_size, True, out=Y, seg_indptr=seg_indptr)
+    )
+    flops = 2 * batch_size * num_tokens_per_group * d_in * d_out
+
+    print(
+        f"Config: batch_size={batch_size}, num_tokens_per_group={num_tokens_per_group}, d_in={d_in}, d_out={d_out}, dtype={dtype}, output_dtype={output_dtype}"
+    )
+    print(f"FLOPs: {flops / ms * 1e-9:.2f} TFLOPs/s")
+
+
+if __name__ == "__main__":
+    for dtype_in in [torch.float8_e4m3fn, torch.bfloat16]:
+        for dtype_out in [torch.bfloat16]:
+            for batch_size in [1, 3, 8, 16]:
+                for num_tokens_per_group in [32, 64, 128, 256, 512]:
+                    for d_in in [4096, 8192]:
+                        for d_out in [4096, 8192]:
+                            bench_grouped_gemm(
+                                batch_size,
+                                num_tokens_per_group,
+                                d_in,
+                                d_out,
+                                dtype_in,
+                                dtype_out,
+                            )
diff --git a/csrc/flashinfer_gemm_sm90_ops.cu b/csrc/flashinfer_gemm_sm90_ops.cu
@@ -18,7 +18,8 @@
 void CutlassSegmentGEMMSM90(at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer,
                             at::Tensor all_problems, at::Tensor x_ptr, at::Tensor w_ptr,
                             at::Tensor y_ptr, at::Tensor x_stride, at::Tensor weight_stride,
-                            at::Tensor y_stride, at::Tensor empty_x_data, bool weight_column_major);
+                            at::Tensor y_stride, at::Tensor empty_x_data, at::Tensor empty_y_data,
+                            bool weight_column_major);
 
 TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
   // "Cutlass Segment GEMM operator for SM90"
diff --git a/csrc/flashinfer_ops_sm90.cu b/csrc/flashinfer_ops_sm90.cu
@@ -19,7 +19,8 @@
 void CutlassSegmentGEMMSM90(at::Tensor float_workspace_buffer, at::Tensor int_workspace_buffer,
                             at::Tensor all_problems, at::Tensor x_ptr, at::Tensor w_ptr,
                             at::Tensor y_ptr, at::Tensor x_stride, at::Tensor weight_stride,
-                            at::Tensor y_stride, at::Tensor empty_x_data, bool weight_column_major);
+                            at::Tensor y_stride, at::Tensor empty_x_data, at::Tensor empty_y_data,
+                            bool weight_column_major);
 
 void single_prefill_with_kv_cache_sm90(
     at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor tmp, at::Tensor o,
diff --git a/csrc/group_gemm_bf16_bf16_sm90.cu b/csrc/group_gemm_bf16_bf16_sm90.cu
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2024 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <flashinfer/gemm/group_gemm_sm90.cuh>
+
+using namespace flashinfer;
+using namespace flashinfer::group_gemm;
+
+namespace flashinfer {
+namespace group_gemm {
+// Explicit instantiation for the <cutlass::bfloat16_t, cutlass::bfloat16_t> type pair.
+template cudaError_t CutlassSegmentGEMMSM90Run<cutlass::bfloat16_t, cutlass::bfloat16_t>(
+    void* float_buffer, size_t float_buffer_size_in_bytes, void* int_buffer,
+    size_t int_buffer_size_in_bytes, void* all_problems, int64_t batch_size, void* x, void* w,
+    void* y, void* x_stride, void* w_stride, void* y_stride, bool weight_column_major,
+    cudaStream_t stream);
+
+}  // namespace group_gemm
+}  // namespace flashinfer
diff --git a/csrc/group_gemm_e4m3_bf16_sm90.cu b/csrc/group_gemm_e4m3_bf16_sm90.cu
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2024 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <flashinfer/gemm/group_gemm_sm90.cuh>
+
+using namespace flashinfer;
+using namespace flashinfer::group_gemm;
+
+namespace flashinfer {
+namespace group_gemm {
+// Explicit instantiation for the <cutlass::float_e4m3_t, cutlass::bfloat16_t> type pair.
+template cudaError_t CutlassSegmentGEMMSM90Run<cutlass::float_e4m3_t, cutlass::bfloat16_t>(
+    void* float_buffer, size_t float_buffer_size_in_bytes, void* int_buffer,
+    size_t int_buffer_size_in_bytes, void* all_problems, int64_t batch_size, void* x, void* w,
+    void* y, void* x_stride, void* w_stride, void* y_stride, bool weight_column_major,
+    cudaStream_t stream);
+
+}  // namespace group_gemm
+}  // namespace flashinfer
diff --git a/csrc/group_gemm_e4m3_f16_sm90.cu b/csrc/group_gemm_e4m3_f16_sm90.cu
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2024 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <flashinfer/gemm/group_gemm_sm90.cuh>
+
+using namespace flashinfer;
+using namespace flashinfer::group_gemm;
+
+namespace flashinfer {
+namespace group_gemm {
+// Explicit instantiation for the <cutlass::float_e4m3_t, cutlass::half_t> type pair.
+template cudaError_t CutlassSegmentGEMMSM90Run<cutlass::float_e4m3_t, cutlass::half_t>(
+    void* float_buffer, size_t float_buffer_size_in_bytes, void* int_buffer,
+    size_t int_buffer_size_in_bytes, void* all_problems, int64_t batch_size, void* x, void* w,
+    void* y, void* x_stride, void* w_stride, void* y_stride, bool weight_column_major,
+    cudaStream_t stream);
+
+}  // namespace group_gemm
+}  // namespace flashinfer
diff --git a/csrc/group_gemm_e5m2_bf16_sm90.cu b/csrc/group_gemm_e5m2_bf16_sm90.cu
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2024 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <flashinfer/gemm/group_gemm_sm90.cuh>
+
+using namespace flashinfer;
+using namespace flashinfer::group_gemm;
+
+namespace flashinfer {
+namespace group_gemm {
+// Explicit instantiation for the <cutlass::float_e5m2_t, cutlass::bfloat16_t> type pair.
+template cudaError_t CutlassSegmentGEMMSM90Run<cutlass::float_e5m2_t, cutlass::bfloat16_t>(
+    void* float_buffer, size_t float_buffer_size_in_bytes, void* int_buffer,
+    size_t int_buffer_size_in_bytes, void* all_problems, int64_t batch_size, void* x, void* w,
+    void* y, void* x_stride, void* w_stride, void* y_stride, bool weight_column_major,
+    cudaStream_t stream);
+
+}  // namespace group_gemm
+}  // namespace flashinfer
diff --git a/csrc/group_gemm_e5m2_f16_sm90.cu b/csrc/group_gemm_e5m2_f16_sm90.cu
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2024 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <flashinfer/gemm/group_gemm_sm90.cuh>
+
+using namespace flashinfer;
+using namespace flashinfer::group_gemm;
+
+namespace flashinfer {
+namespace group_gemm {
+// Explicit instantiation for the <cutlass::float_e5m2_t, cutlass::half_t> type pair.
+template cudaError_t CutlassSegmentGEMMSM90Run<cutlass::float_e5m2_t, cutlass::half_t>(
+    void* float_buffer, size_t float_buffer_size_in_bytes, void* int_buffer,
+    size_t int_buffer_size_in_bytes, void* all_problems, int64_t batch_size, void* x, void* w,
+    void* y, void* x_stride, void* w_stride, void* y_stride, bool weight_column_major,
+    cudaStream_t stream);
+
+}  // namespace group_gemm
+}  // namespace flashinfer
diff --git a/csrc/group_gemm_f16_f16_sm90.cu b/csrc/group_gemm_f16_f16_sm90.cu
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2024 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <flashinfer/gemm/group_gemm_sm90.cuh>
+
+using namespace flashinfer;
+using namespace flashinfer::group_gemm;
+
+namespace flashinfer {
+namespace group_gemm {
+// Explicit instantiation for the <cutlass::half_t, cutlass::half_t> type pair.
+template cudaError_t CutlassSegmentGEMMSM90Run<cutlass::half_t, cutlass::half_t>(
+    void* float_buffer, size_t float_buffer_size_in_bytes, void* int_buffer,
+    size_t int_buffer_size_in_bytes, void* all_problems, int64_t batch_size, void* x, void* w,
+    void* y, void* x_stride, void* w_stride, void* y_stride, bool weight_column_major,
+    cudaStream_t stream);
+
+}  // namespace group_gemm
+}  // namespace flashinfer
diff --git a/csrc/group_gemm_sm90.cu b/csrc/group_gemm_sm90.cu
diff --git a/flashinfer/gemm.py b/flashinfer/gemm.py
diff --git a/include/flashinfer/gemm/group_gemm_sm90.cuh b/include/flashinfer/gemm/group_gemm_sm90.cuh
diff --git a/setup.py b/setup.py