
Commit c9ff88a

enable cuda graph for lora
Co-authored-by: Beichen Ma <[email protected]>
1 parent d2b8d0b commit c9ff88a

8 files changed (+299, -43 lines)


benchmark/lora/launch_server.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ def launch_server(args):
     for i in range(NUM_LORAS):
         lora_name = f"lora{i}"
         cmd += f"{lora_name}={lora_path} "
-    cmd += f"--disable-radix --disable-cuda-graph "
+    cmd += f"--disable-radix "
     cmd += f"--max-loras-per-batch {args.max_loras_per_batch} "
     cmd += f"--max-running-requests {args.max_running_requests} "
     cmd += f"--lora-backend {args.lora_backend} "

docs/backend/server_arguments.md

Lines changed: 1 addition & 1 deletion
@@ -133,7 +133,7 @@ Please consult the documentation below to learn more about the parameters you ma
 
 ## LoRA
 
-* `lora_paths`: You may provide a list of adapters to your model as a list. Each batch element will get model response with the corresponding lora adapter applied. Currently `cuda_graph` and `radix_attention` are not supported with this option so you need to disable them manually. We are still working on through these [issues](https://github.com/sgl-project/sglang/issues/2929).
+* `lora_paths`: You may provide a list of adapters to your model as a list. Each batch element will get model response with the corresponding lora adapter applied. Currently `radix_attention` is not supported with this option so you need to disable it manually. We are still working on through these [issues](https://github.com/sgl-project/sglang/issues/2929).
 * `max_loras_per_batch`: Maximum number of LoRAs in a running batch including base model.
 * `lora_backend`: The backend of running GEMM kernels for Lora modules, can be one of `triton` or `flashinfer`. Defaults to be `triton`.
 
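To make the `lora_paths` option concrete, here is an illustrative request that selects one of the registered adapters for a single batch element. The endpoint and payload keys are assumptions based on the description above, not something this diff confirms.

```python
import requests

# Illustrative only: per-request LoRA selection against a locally running
# sglang server started with `--lora-paths lora0=<path> ...`.
# Treat the exact endpoint and field names here as assumptions.
resp = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "Give me a one-line summary of LoRA.",
        "sampling_params": {"max_new_tokens": 64},
        "lora_path": "lora0",  # adapter name registered at launch time
    },
)
print(resp.json())
```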

python/sglang/srt/lora/layers.py

Lines changed: 40 additions & 7 deletions
@@ -127,6 +127,7 @@ def __init__(
         lora_backend: BaseLoRABackend,
     ) -> None:
         super().__init__(base_layer, lora_backend)
+        self.B_buffer_gate_up = None
 
     def set_lora_info(
         self,
@@ -138,9 +139,20 @@ def set_lora_info(
         if self.lora_backend.fuse_stacked_lora_b:
             # TODO: avoid using contiguous() in GPU.
             # B_buffer_gate_up: (num_lora, 2 * output_dim, r)
-            self.B_buffer_gate_up = torch.cat(
-                (B_buffer[0], B_buffer[1]), dim=-2
-            ).contiguous()
+            if self.B_buffer_gate_up is None:
+                self.B_buffer_gate_up = torch.empty(
+                    (
+                        B_buffer[0].shape[0],
+                        2 * B_buffer[0].shape[1],
+                        B_buffer[0].shape[2],
+                    ),
+                    dtype=B_buffer[0].dtype,
+                    device=B_buffer[0].device,
+                ).contiguous()
+            # TODO: avoid using contiguous() in GPU.
+            # B_buffer_gate_up: (num_lora, 2 * output_dim, r)
+            self.B_buffer_gate_up[:, : B_buffer[0].shape[1], :].copy_(B_buffer[0])
+            self.B_buffer_gate_up[:, B_buffer[0].shape[1] :, :].copy_(B_buffer[1])
         else:
             self.B_buffer_gate_up = (B_buffer[0], B_buffer[1])
 
@@ -171,12 +183,15 @@ def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int):
 
 
 class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
-    def init__(
+    def __init__(
         self,
         base_layer: QKVParallelLinear,
         lora_backend: BaseLoRABackend,
     ) -> None:
         super().__init__(base_layer, lora_backend)
+        self.output_offset = None
+        self.B_buffer_qkv = None
+        self.max_qkv_out_dim = 0
 
     def set_lora_info(
         self,
@@ -194,9 +209,27 @@ def set_lora_info(
             output_dim_q, output_dim_kv = B_buffer_q.shape[-2], B_buffer_kv.shape[-2]
 
             # B_buffer_qkv: (num_lora, output_dim_q + 2 * output_dim_kv, r)
-            self.B_buffer_qkv = torch.cat(
-                (B_buffer_q[0], B_buffer_kv[0], B_buffer_kv[1]), dim=-2
-            ).contiguous()
+            # self.B_buffer_qkv = torch.cat(
+            #     (B_buffer_q[0], B_buffer_kv[0], B_buffer_kv[1]), dim=-2
+            # ).contiguous()
+
+            if self.B_buffer_qkv is None:
+                self.B_buffer_qkv = torch.empty(
+                    (
+                        B_buffer_q[0].shape[0],
+                        output_dim_q + 2 * output_dim_kv,
+                        B_buffer_q[0].shape[2],
+                    ),
+                    dtype=B_buffer_q[0].dtype,
+                    device=B_buffer_q[0].device,
+                ).contiguous()
+            self.B_buffer_qkv[:, :output_dim_q, :].copy_(B_buffer_q[0])
+            self.B_buffer_qkv[:, output_dim_q : output_dim_q + output_dim_kv, :].copy_(
+                B_buffer_kv[0]
+            )
+            self.B_buffer_qkv[:, output_dim_q + output_dim_kv :, :].copy_(
+                B_buffer_kv[1]
+            )
 
             # Offsets of q/k/v in output dimension
             self.output_offset = torch.tensor(
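Both the gate/up and q/k/v changes above rely on the same trick: allocate the fused LoRA-B buffer once with `torch.empty`, then write later adapter weights into it with `copy_`, so the buffer address recorded during CUDA graph capture stays valid instead of changing on every `torch.cat`. A minimal standalone sketch of that pattern follows; the `FusedLoRABuffer` class is hypothetical, not sglang code.

```python
import torch

# Allocate-once / copy-in-place pattern: torch.cat returns a new tensor (new
# address) each call, which breaks CUDA graph replay, so the fused buffer is
# created lazily and reused with in-place copies.
class FusedLoRABuffer:
    def __init__(self):
        self.fused = None  # allocated on first update

    def update(self, b0: torch.Tensor, b1: torch.Tensor) -> torch.Tensor:
        # b0, b1: (num_lora, output_dim, r)
        num_lora, output_dim, r = b0.shape
        if self.fused is None:
            self.fused = torch.empty(
                (num_lora, 2 * output_dim, r), dtype=b0.dtype, device=b0.device
            )
        # In-place writes keep self.fused at the same device address.
        self.fused[:, :output_dim, :].copy_(b0)
        self.fused[:, output_dim:, :].copy_(b1)
        return self.fused

buf = FusedLoRABuffer()
b_gate = torch.randn(4, 8, 16)
b_up = torch.randn(4, 8, 16)
fused = buf.update(b_gate, b_up)
assert fused.data_ptr() == buf.update(b_gate, b_up).data_ptr()  # address is stable
```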

python/sglang/srt/lora/lora_manager.py

Lines changed: 82 additions & 31 deletions
@@ -53,11 +53,13 @@ def __init__(
         lora_backend: str = "triton",
         tp_size: int = 1,
         tp_rank: int = 0,
+        max_bs_in_cuda_graph: int = 0,
     ):
         self.base_model: torch.nn.Module = base_model
         self.lora_paths: Dict[str, str] = lora_paths
         self.base_hf_config: AutoConfig = base_hf_config
         self.max_loras_per_batch: int = max_loras_per_batch
+        self.max_bs_in_cuda_graph: int = max_bs_in_cuda_graph
         self.load_config: LoadConfig = load_config
         self.dtype: torch.dtype = dtype
         self.device: torch.device = next(self.base_model.parameters()).device
@@ -72,6 +74,23 @@ def __init__(
         self.init_loras()
         self.init_lora_memory_pool()
 
+    def init_cuda_graph_batch_info(self, max_bs_in_cuda_graph: int):
+        self.max_bs_in_cuda_graph = max_bs_in_cuda_graph
+        with torch.device("cuda"):
+            self.cuda_graph_batch_info = LoRABatchInfo(
+                bs=self.max_bs_in_cuda_graph,
+                seg_lens=torch.zeros(self.max_bs_in_cuda_graph, dtype=torch.int32),
+                seg_indptr=torch.zeros(
+                    self.max_bs_in_cuda_graph + 1, dtype=torch.int32
+                ),
+                max_len=0,
+                weight_indices=torch.zeros(
+                    self.max_bs_in_cuda_graph, dtype=torch.int32
+                ),
+                lora_ranks=torch.zeros(self.max_loras_per_batch, dtype=torch.int32),
+                scalings=torch.zeros(self.max_loras_per_batch, dtype=torch.float),
+            )
+
     def init_loras(self):
         # Config of each LoRA adapter
         self.configs: Dict[str, LoRAConfig] = {}
@@ -140,39 +159,71 @@ def prepare_lora_batch(self, forward_batch: ForwardBatch):
         if cur_uids == set([None]):
             return
 
-        # set up batch info shared by all lora moruldes
+        # set up batch info shared by all lora modules
         bs = forward_batch.batch_size
-        seg_lens = (
-            forward_batch.extend_seq_lens
-            if forward_batch.forward_mode.is_extend()
-            else torch.ones(bs, device=self.device)
-        )
-        seg_indptr = torch.zeros((bs + 1,), dtype=torch.int32, device=self.device)
-        seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)
-        max_len = int(torch.max(seg_lens))
-        weight_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)
 
-        lora_ranks = torch.empty(
-            (self.max_loras_per_batch,), dtype=torch.int64, device="cuda"
-        )
-        scalings = torch.empty(
-            (self.max_loras_per_batch,), dtype=torch.float, device="cuda"
-        )
-        for i, lora_path in enumerate(forward_batch.lora_paths):
-            weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
-            lora = self.loras[lora_path]
-            lora_ranks[weight_indices[i]] = lora.config.hf_config["r"]
-            scalings[weight_indices[i]] = lora.scaling
-
-        batch_info = LoRABatchInfo(
-            bs=bs,
-            seg_lens=seg_lens,
-            seg_indptr=seg_indptr,
-            max_len=max_len,
-            weight_indices=weight_indices,
-            lora_ranks=lora_ranks,
-            scalings=scalings,
-        )
+        if bs <= self.max_bs_in_cuda_graph:
+            # Do in-place update for cuda graph
+            self.cuda_graph_batch_info.bs = bs
+            if forward_batch.forward_mode.is_extend():
+                self.cuda_graph_batch_info.seg_lens[:bs].copy_(
+                    forward_batch.extend_seq_lens
+                )
+            else:
+                self.cuda_graph_batch_info.seg_lens[:bs].fill_(1)
+            self.cuda_graph_batch_info.seg_indptr[0] = 0
+            torch.cumsum(
+                self.cuda_graph_batch_info.seg_lens[:bs],
+                dim=0,
+                out=self.cuda_graph_batch_info.seg_indptr[1 : bs + 1],
+            )
+            self.cuda_graph_batch_info.max_len = int(
+                torch.max(self.cuda_graph_batch_info.seg_lens[:bs])
+            )
+
+            for i, lora_path in enumerate(forward_batch.lora_paths):
+                self.cuda_graph_batch_info.weight_indices[i] = (
+                    self.memory_pool.get_buffer_id(lora_path)
+                )
+                lora = self.loras[lora_path]
+                self.cuda_graph_batch_info.lora_ranks[
+                    self.cuda_graph_batch_info.weight_indices[i]
+                ] = lora.config.hf_config["r"]
+                self.cuda_graph_batch_info.scalings[
+                    self.cuda_graph_batch_info.weight_indices[i]
+                ] = lora.scaling
+            batch_info = self.cuda_graph_batch_info
+        else:
+            seg_lens = (
+                forward_batch.extend_seq_lens
+                if forward_batch.forward_mode.is_extend()
+                else torch.ones(bs, device=self.device)
+            )
+            seg_indptr = torch.zeros((bs + 1,), dtype=torch.int32, device=self.device)
+            seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)
+            max_len = int(torch.max(seg_lens))
+            weight_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)
+
+            lora_ranks = torch.empty(
+                (self.max_loras_per_batch,), dtype=torch.int64, device="cuda"
+            )
+            scalings = torch.empty(
+                (self.max_loras_per_batch,), dtype=torch.float, device="cuda"
+            )
+            for i, lora_path in enumerate(forward_batch.lora_paths):
+                weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
+                lora = self.loras[lora_path]
+                lora_ranks[weight_indices[i]] = lora.config.hf_config["r"]
+                scalings[weight_indices[i]] = lora.scaling
+            batch_info = LoRABatchInfo(
+                bs=bs,
+                seg_lens=seg_lens,
+                seg_indptr=seg_indptr,
+                max_len=max_len,
+                weight_indices=weight_indices,
+                lora_ranks=lora_ranks,
+                scalings=scalings,
+            )
         self.lora_backend.set_batch_info(batch_info)
 
         # call set_lora_info for each lora modules
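`prepare_lora_batch` now has two paths: when the batch fits within `max_bs_in_cuda_graph` it rewrites the pre-allocated `cuda_graph_batch_info` tensors in place, and otherwise it falls back to building a fresh `LoRABatchInfo` as before. A condensed, standalone sketch of the in-place path is shown below, with hypothetical helper names and CPU tensors for illustration only.

```python
import torch
from dataclasses import dataclass

# The tensors are allocated once at max size; only their leading [:bs] slices
# are rewritten, so a captured CUDA graph keeps reading valid addresses.
@dataclass
class BatchInfo:
    bs: int
    seg_lens: torch.Tensor    # (max_bs,)
    seg_indptr: torch.Tensor  # (max_bs + 1,)
    max_len: int

def init_batch_info(max_bs: int) -> BatchInfo:
    return BatchInfo(
        bs=max_bs,
        seg_lens=torch.zeros(max_bs, dtype=torch.int32),
        seg_indptr=torch.zeros(max_bs + 1, dtype=torch.int32),
        max_len=0,
    )

def update_in_place(info: BatchInfo, seq_lens: torch.Tensor) -> None:
    bs = seq_lens.numel()
    info.bs = bs
    info.seg_lens[:bs].copy_(seq_lens)  # no new allocation
    info.seg_indptr[0] = 0
    torch.cumsum(info.seg_lens[:bs], dim=0, out=info.seg_indptr[1 : bs + 1])
    info.max_len = int(info.seg_lens[:bs].max())

info = init_batch_info(max_bs=8)
update_in_place(info, torch.tensor([3, 1, 5], dtype=torch.int32))
print(info.seg_indptr[:4])  # tensor([0, 3, 4, 9], dtype=torch.int32)
```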

python/sglang/srt/model_executor/cuda_graph_runner.py

Lines changed: 14 additions & 0 deletions
@@ -220,6 +220,9 @@ def __init__(self, model_runner: ModelRunner):
         if self.enable_torch_compile:
             set_torch_compile_config()
 
+        if self.model_runner.server_args.lora_paths is not None:
+            self.model_runner.lora_manager.init_cuda_graph_batch_info(self.max_bs)
+
         # Graph inputs
         with torch.device("cuda"):
             self.input_ids = torch.zeros((self.max_num_token,), dtype=torch.int64)
@@ -403,6 +406,13 @@ def capture_one_batch_size(self, bs: int, forward: Callable):
         self.capture_hidden_mode = (
             spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
         )
+        if self.model_runner.server_args.lora_paths is not None:
+            # Currently, if the lora_path in `lora_paths` is None, the lora backend will use a
+            # different logic to handle lora, so we need to set `lora_paths` to a list of non-None
+            # values if lora is enabled.
+            lora_paths = [next(iter(self.model_runner.server_args.lora_paths))] * bs
+        else:
+            lora_paths = None
 
         forward_batch = ForwardBatch(
             forward_mode=self.capture_forward_mode,
@@ -424,8 +434,12 @@ def capture_one_batch_size(self, bs: int, forward: Callable):
             spec_algorithm=self.model_runner.spec_algorithm,
             spec_info=spec_info,
             capture_hidden_mode=self.capture_hidden_mode,
+            lora_paths=lora_paths,
         )
 
+        if lora_paths is not None:
+            self.model_runner.lora_manager.prepare_lora_batch(forward_batch)
+
         # Attention backend
         self.model_runner.attn_backend.init_forward_metadata_capture_cuda_graph(
             bs,
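The runner builds the capture-time `ForwardBatch` with a fixed, non-None `lora_paths` list and calls `prepare_lora_batch` before capture, so the LoRA kernels recorded into the graph already read from the pre-allocated batch-info tensors. For background, here is a generic PyTorch capture/replay sketch (no sglang specifics) showing the update-in-place-then-replay discipline this relies on.

```python
import torch

# A captured CUDA graph replays recorded kernels against the same memory
# addresses, so new inputs are copied into static buffers before replay
# instead of being passed as new tensors.
def demo_cuda_graph():
    if not torch.cuda.is_available():
        return
    static_x = torch.zeros(8, 16, device="cuda")
    weight = torch.randn(16, 16, device="cuda")
    static_out = torch.zeros(8, 16, device="cuda")

    # Warm up on a side stream before capture (as recommended by PyTorch docs).
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        static_out.copy_(static_x @ weight)
    torch.cuda.current_stream().wait_stream(s)

    # Capture one forward pass; the graph records these buffer addresses.
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        static_out.copy_(static_x @ weight)

    # New inputs must be copied into the same static buffers before replay.
    static_x.copy_(torch.randn(8, 16, device="cuda"))
    graph.replay()
    torch.cuda.synchronize()
    print(static_out.abs().sum().item())

demo_cuda_graph()
```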

python/sglang/srt/server_args.py

Lines changed: 0 additions & 1 deletion
@@ -1230,7 +1230,6 @@ def check_server_args(self):
         assert (
             self.max_loras_per_batch > 0
             # FIXME
-            and (self.lora_paths is None or self.disable_cuda_graph)
             and (self.lora_paths is None or self.disable_radix_cache)
         ), "compatibility of lora and cuda graph and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
