
Commit 588865f

[Feature] Support Tensor Parallelism and Weight Slicing for Lora (#4274)

Authored by aoshen524, ShenAo1111, and Fridge003
Co-authored-by: ShenAo1111 <[email protected]>
Co-authored-by: Baizhou Zhang <[email protected]>

1 parent 3196999 · commit 588865f

File tree: 13 files changed, +528 −103 lines


.github/workflows/pr-test.yml

Lines changed: 6 additions & 0 deletions
@@ -127,6 +127,12 @@ jobs:
           cd test/srt
           python3 test_mla_tp.py
 
+      - name: Test lora tensor parallelism (TP=2)
+        timeout-minutes: 10
+        run: |
+          cd test/srt/models/lora
+          python3 test_lora_tp.py
+
   performance-test-1-gpu-part-1:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false
benchmark/lora/launch_server.py

Lines changed: 16 additions & 1 deletion
@@ -22,7 +22,10 @@ def launch_server(args):
     cmd += f"--disable-radix --disable-cuda-graph "
     cmd += f"--max-loras-per-batch {args.max_loras_per_batch} "
     cmd += f"--max-running-requests {args.max_running_requests} "
-    cmd += f"--lora-backend {args.lora_backend}"
+    cmd += f"--lora-backend {args.lora_backend} "
+    cmd += f"--tp-size {args.tp_size} "
+    if args.disable_custom_all_reduce:
+        cmd += "--disable-custom-all-reduce"
     print(cmd)
     os.system(cmd)
 

@@ -48,6 +51,18 @@ def launch_server(args):
         type=str,
         default="triton",
     )
+    parser.add_argument(
+        "--tp-size",
+        type=int,
+        default=1,
+        help="Tensor parallel size for distributed inference",
+    )
+    # disable_custom_all_reduce
+    parser.add_argument(
+        "--disable-custom-all-reduce",
+        action="store_true",
+        help="Disable custom all reduce when device does not support p2p communication",
+    )
     args = parser.parse_args()
 
     launch_server(args)
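A quick sanity check of the two new flags (a sketch, not part of the commit; every other launcher option is assumed to fall back to the defaults defined in this script):

# Hypothetical invocation of the updated benchmark launcher with TP=2.
# --disable-custom-all-reduce is only needed on hosts without P2P GPU access.
import subprocess

subprocess.run(
    [
        "python3",
        "benchmark/lora/launch_server.py",
        "--tp-size", "2",
        "--disable-custom-all-reduce",
    ],
    check=True,
)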

python/sglang/srt/layers/linear.py

Lines changed: 2 additions & 0 deletions
@@ -782,6 +782,8 @@ def __init__(
         else:
             self.num_kv_heads = divide(self.total_num_kv_heads, tp_size)
             self.num_kv_head_replicas = 1
+        self.q_proj_shard_size = self.num_heads * self.head_size
+        self.kv_proj_shard_size = self.num_kv_heads * self.head_size
         input_size = self.hidden_size
         output_size = (
             (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size
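The two new attributes are simply the per-rank output widths of the Q and K/V projections. A short worked example with an assumed Llama-style config (32 query heads, 8 KV heads, head size 128, TP=2) shows the values that QKVParallelLinearWithLoRA later relies on for slicing:

# Illustrative numbers only; not taken from the commit.
total_num_heads, total_num_kv_heads, head_size, tp_size = 32, 8, 128, 2

num_heads = total_num_heads // tp_size         # 16 query heads per rank
num_kv_heads = total_num_kv_heads // tp_size   # 4 KV heads per rank

q_proj_shard_size = num_heads * head_size      # 16 * 128 = 2048
kv_proj_shard_size = num_kv_heads * head_size  # 4 * 128 = 512

print(q_proj_shard_size, kv_proj_shard_size)   # 2048 512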

python/sglang/srt/lora/layers.py

Lines changed: 68 additions & 0 deletions
@@ -1,3 +1,5 @@
+from typing import List, Tuple
+
 import torch
 from torch import nn
 

@@ -38,8 +40,22 @@ def forward(self, x: torch.Tensor):
     def set_lora_info(self, *args):
         pass
 
+    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
+        pass
+
+    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int):
+        pass
+
 
 class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
+    """
+    Vocab parallel embedding layer with support for LoRA (Low-Rank Adaptation).
+
+    Note: The current version does not yet implement the LoRA functionality.
+    This class behaves exactly the same as the base VocabParallelEmbedding.
+    Future versions will integrate LoRA functionality to support efficient parameter fine-tuning.
+    """
+
     def __init__(
         self,
         base_layer: VocabParallelEmbedding,

@@ -101,6 +117,16 @@ def forward(self, input_: torch.Tensor):
         output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None
         return output, output_bias
 
+    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
+        return A
+
+    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int):
+        shard_size = self.base_layer.output_partition_sizes[0]
+        start_idx = tp_rank * shard_size
+        end_idx = (tp_rank + 1) * shard_size
+        B = B[start_idx:end_idx, :]
+        return B
+
 
 class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
     def __init__(

@@ -120,6 +146,7 @@ def set_lora_info(
         self.set_lora = True
         self.A_buffer_gate_up = A_buffer
         if self.lora_backend.fuse_stacked_lora_b:
+            # TODO: avoid using contiguous() in GPU.
             # B_buffer_gate_up: (num_lora, 2 * output_dim, r)
             self.B_buffer_gate_up = torch.cat(
                 (B_buffer[0], B_buffer[1]), dim=-2

@@ -142,6 +169,16 @@ def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor
             else base_output + lora_output * self.scaling
         )
 
+    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
+        return A
+
+    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int):
+        # Since the outputs for both gate and up are identical, we use a random one.
+        shard_size = self.base_layer.output_partition_sizes[0]
+        start_idx = tp_rank * shard_size
+        end_idx = (tp_rank + 1) * shard_size
+        return B[:, start_idx:end_idx, :]
+
 
 class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
     def init__(

@@ -210,6 +247,27 @@ def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor
             else base_output + lora_output * self.scaling
         )
 
+    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
+        return A
+
+    def slice_lora_b_weights(
+        self, B: List[torch.Tensor], tp_rank: int
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        B_q, B_kv = B
+        base_layer = self.base_layer
+        q_proj_shard_size = base_layer.q_proj_shard_size
+        kv_proj_shard_size = base_layer.kv_proj_shard_size
+        num_kv_head_replicas = base_layer.num_kv_head_replicas
+
+        q_start_idx = q_proj_shard_size * tp_rank
+        q_end_idx = q_start_idx + q_proj_shard_size
+
+        kv_shard_id = tp_rank // num_kv_head_replicas
+        kv_start_idx = kv_proj_shard_size * kv_shard_id
+        kv_end_idx = kv_start_idx + kv_proj_shard_size
+
+        return B_q[q_start_idx:q_end_idx, :], B_kv[:, kv_start_idx:kv_end_idx, :]
+
 
 class RowParallelLinearWithLoRA(BaseLayerWithLoRA):
     def __init__(

@@ -274,6 +332,16 @@ def forward(self, input_: torch.Tensor):
         output_bias = self.base_layer.bias
         return output, output_bias
 
+    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
+        shard_size = self.base_layer.input_size_per_partition
+        start_idx = tp_rank * shard_size
+        end_idx = (tp_rank + 1) * shard_size
+        A = A[:, start_idx:end_idx].contiguous()
+        return A
+
+    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int):
+        return B
+
 
 def get_lora_layer(
     layer: nn.Module, lora_rank: int, scaling: int, lora_backend: BaseLoRABackend
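To see why the slicing above reproduces the unsharded result, here is a self-contained sketch (plain torch, assumed shapes, no SGLang classes): for column-parallel layers LoRA A stays replicated and B is split along its output dimension, while row-parallel layers split A along the input dimension and keep B whole; concatenating (column-parallel) or summing (row-parallel) the per-rank partials recovers the full LoRA delta.

import torch

tp_size, r, in_dim, out_dim = 2, 16, 512, 512
x = torch.randn(1, in_dim)
A = torch.randn(r, in_dim)   # LoRA A
B = torch.randn(out_dim, r)  # LoRA B

# Column-parallel case (gate/up, q/k/v): A replicated, B sliced over output dim.
shard = out_dim // tp_size
partials = [
    x @ A.T @ B[rank * shard : (rank + 1) * shard, :].T for rank in range(tp_size)
]
assert torch.allclose(torch.cat(partials, dim=-1), x @ A.T @ B.T, atol=1e-3)

# Row-parallel case (o_proj, down_proj): the input is already sharded, so A is
# sliced over its input dim and B replicated; per-rank outputs are summed
# (the all-reduce that the base row-parallel layer performs anyway).
in_shard = in_dim // tp_size
partials = [
    x[:, rank * in_shard : (rank + 1) * in_shard]
    @ A[:, rank * in_shard : (rank + 1) * in_shard].T
    @ B.T
    for rank in range(tp_size)
]
assert torch.allclose(sum(partials), x @ A.T @ B.T, atol=1e-3)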

python/sglang/srt/lora/lora.py

Lines changed: 2 additions & 22 deletions
@@ -39,16 +39,9 @@ def __init__(self, config: LoRAConfig, base_hf_config: AutoConfig):
         super().__init__()
         self.config: LoRAConfig = config
         self.base_hf_config: AutoConfig = base_hf_config
-        self.weights: Dict[str, torch.Tensor] = {}
-        self.weight_gpu: Dict[str, torch.Tensor] = {}
-
-    def load_to_gpu(self):
-        for name, weight in self.weights.items():
-            self.weight_gpu[name] = weight.to(torch.float16).to("cuda")
 
-    def offload_from_gpu(self):
-        for name, weight in self.weights.items():
-            self.weight_gpu[name] = None
+        # lora weights in cpu. The weights are loaded from checkpoint.
+        self.weights: Dict[str, torch.Tensor] = {}
 
 
 class LoRAAdapter(nn.Module):

@@ -77,19 +70,6 @@ def __init__(
         )
 
         self.weights: Dict[str, torch.Tensor] = {}
-        self.weights_gpu: Dict[str, torch.Tensor] = {}
-
-    def load_to_gpu(self):
-        for name, weight in self.weights.items():
-            self.weights_gpu[name] = weight.to(torch.float16).to("cuda")
-        for layer in self.layers:
-            layer.load_to_gpu()
-
-    def offload_from_gpu(self):
-        for name, weight in self.weights.items():
-            self.weights_gpu[name] = None
-        for layer in self.layers:
-            layer.offload_from_gpu()
 
     # initialize the LoRA weights to cpu
     def initialize_weights(self):

python/sglang/srt/lora/lora_manager.py

Lines changed: 47 additions & 23 deletions
@@ -23,7 +23,7 @@
 from sglang.srt.configs.load_config import LoadConfig
 from sglang.srt.hf_transformers_utils import AutoConfig
 from sglang.srt.lora.backend import BaseLoRABackend, get_backend_from_name
-from sglang.srt.lora.layers import get_lora_layer
+from sglang.srt.lora.layers import BaseLayerWithLoRA, get_lora_layer
 from sglang.srt.lora.lora import LoRAAdapter
 from sglang.srt.lora.lora_config import LoRAConfig
 from sglang.srt.lora.mem_pool import LoRAMemoryPool

@@ -51,13 +51,18 @@ def __init__(
         load_config: LoadConfig,
         dtype: torch.dtype,
         lora_backend: str = "triton",
+        tp_size: int = 1,
+        tp_rank: int = 0,
     ):
         self.base_model: torch.nn.Module = base_model
         self.lora_paths: Dict[str, str] = lora_paths
         self.base_hf_config: AutoConfig = base_hf_config
         self.max_loras_per_batch: int = max_loras_per_batch
         self.load_config: LoadConfig = load_config
         self.dtype: torch.dtype = dtype
+        self.device: torch.device = next(self.base_model.parameters()).device
+        self.tp_size: int = tp_size
+        self.tp_rank: int = tp_rank
 
         # LoRA backend for running sgemm kernels
         logger.info(f"Using {lora_backend} as backend of LoRA kernels.")

@@ -110,7 +115,13 @@ def init_loras(self):
     def init_lora_memory_pool(self):
         # Initialize memory pool
         self.memory_pool = LoRAMemoryPool(
-            self.base_hf_config, self.max_loras_per_batch, self.max_lora_dim, self.dtype
+            self.base_hf_config,
+            self.max_loras_per_batch,
+            self.max_lora_dim,
+            self.dtype,
+            self.tp_size,
+            self.tp_rank,
+            self.lora_modules,
         )
 
         # Initialize target lora modules in memory pool

@@ -131,12 +142,12 @@ def prepare_lora_batch(self, forward_batch: ForwardBatch):
         seg_lens = (
             forward_batch.extend_seq_lens
             if forward_batch.forward_mode.is_extend()
-            else torch.ones(bs, device="cuda")
+            else torch.ones(bs, device=self.device)
         )
-        seg_indptr = torch.zeros((bs + 1,), dtype=torch.int32, device="cuda")
+        seg_indptr = torch.zeros((bs + 1,), dtype=torch.int32, device=self.device)
         seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)
         max_len = int(torch.max(seg_lens))
-        weight_indices = torch.empty((bs,), dtype=torch.int64, device="cuda")
+        weight_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)
         for i, lora_path in enumerate(forward_batch.lora_paths):
             weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
 

@@ -150,22 +161,32 @@ def prepare_lora_batch(self, forward_batch: ForwardBatch):
         self.lora_backend.set_batch_info(batch_info)
 
         # call set_lora_info for each lora modules
-        for module_name, module in self.lora_modules:
-            layer_id = get_layer_id(module_name)
-            if "qkv_proj" not in module_name:
-                weight_name = get_weight_name(
-                    module_name, self.lora_weight_names, LoRAType.LORA_A
-                )
-                module.set_lora_info(
-                    self.memory_pool.get_tensor(weight_name, layer_id, LoRAType.LORA_A),
-                    self.memory_pool.get_tensor(weight_name, layer_id, LoRAType.LORA_B),
-                )
-            else:
-                module.set_lora_info(
-                    self.memory_pool.get_tensor("qkv_proj", layer_id, LoRAType.LORA_A),
-                    self.memory_pool.get_tensor("q_proj", layer_id, LoRAType.LORA_B),
-                    self.memory_pool.get_tensor("kv_proj", layer_id, LoRAType.LORA_B),
-                )
+        for layer_id, modules in self.lora_modules.items():
+            for module_name, module in modules:
+                if "qkv_proj" in module_name:
+                    module.set_lora_info(
+                        self.memory_pool.get_tensor(
+                            "qkv_proj", layer_id, LoRAType.LORA_A
+                        ),
+                        self.memory_pool.get_tensor(
+                            "q_proj", layer_id, LoRAType.LORA_B
+                        ),
+                        self.memory_pool.get_tensor(
+                            "kv_proj", layer_id, LoRAType.LORA_B
+                        ),
+                    )
+                else:
+                    weight_name = get_weight_name(
+                        module_name, self.lora_weight_names, LoRAType.LORA_A
+                    )
+                    module.set_lora_info(
+                        self.memory_pool.get_tensor(
+                            weight_name, layer_id, LoRAType.LORA_A
+                        ),
+                        self.memory_pool.get_tensor(
+                            weight_name, layer_id, LoRAType.LORA_B
+                        ),
+                    )
 
     def set_lora_module(self, module_name, module):
         lora_module = get_lora_layer(

@@ -182,10 +203,13 @@ def convert_to_lora_layers(self):
         )
 
         # Monkey patch to use the LoRA version layers
-        self.lora_modules: List[Tuple[str, torch.nn.Module]] = []
+        self.lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]] = {
+            i: [] for i in range(self.base_hf_config.num_hidden_layers)
+        }
         for module_name, module in self.base_model.named_modules():
             # The module should be converted if it is included in target_names
             if module_name.split(".")[-1] in customized_target_names:
-                self.lora_modules.append(
+                layer_id = get_layer_id(module_name)
+                self.lora_modules[layer_id].append(
                     (module_name, self.set_lora_module(module_name, module))
                 )
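The manager now groups its LoRA-wrapped modules by layer id instead of keeping a flat list, which is what lets prepare_lora_batch fetch per-layer buffers directly. A rough standalone sketch of that grouping (the module names and the regex here are assumptions for illustration, not SGLang internals):

import re
from collections import defaultdict

def get_layer_id(module_name: str) -> int:
    # Assumes HF-style names such as "model.layers.3.self_attn.qkv_proj".
    return int(re.search(r"layers\.(\d+)\.", module_name).group(1))

target_modules = [
    "model.layers.0.self_attn.qkv_proj",
    "model.layers.0.self_attn.o_proj",
    "model.layers.1.self_attn.qkv_proj",
    "model.layers.1.self_attn.o_proj",
]

# Dict[layer_id, List[module_name]]: mirrors the new self.lora_modules layout,
# with the wrapped BaseLayerWithLoRA objects replaced by plain names.
lora_modules = defaultdict(list)
for name in target_modules:
    lora_modules[get_layer_id(name)].append(name)

for layer_id, modules in sorted(lora_modules.items()):
    print(layer_id, modules)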
