Update vLLM patch #13064

Merged: 1 commit, Apr 10, 2025

118 changes: 84 additions & 34 deletions docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch
@@ -39043,7 +39043,7 @@ index 3ac7fb8df..249b3ed2d 100644
and self.observability_config.collect_model_execute_time):
output.tensors["model_execute_time"] = torch.tensor(
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 9cf253875..df6ab56c6 100644
index 9cf253875..34d098486 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -3,8 +3,8 @@ import time
@@ -39606,7 +39606,7 @@ index 9cf253875..df6ab56c6 100644

self.sampling_metadata_cache: SamplingMetadataCache = \
SamplingMetadataCache() \
@@ -415,10 +719,38 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -415,16 +719,74 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
logger.info("Loading model weights took %.4f GB",
self.model_memory_usage / float(2**30))

@@ -39643,12 +39643,19 @@ index 9cf253875..df6ab56c6 100644
+ return rope_scaling.get("type", None) == "mrope" or rope_scaling.get("mrope_section", None) is not None
+
@torch.inference_mode()
def profile_run(self) -> None:
- def profile_run(self) -> None:
+ def profile_run(self, num_batched_tokens=-1, num_seqs=-1) -> None:
# Enable top-k sampling to reflect the accurate memory usage.
@@ -426,6 +758,30 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
+ assert (num_batched_tokens == -1 or num_batched_tokens > 0)
+ assert (num_seqs == -1 or num_seqs > 0)
max_num_seqs = self.scheduler_config.max_num_seqs

+ if num_batched_tokens != -1:
+ max_num_batched_tokens = num_batched_tokens
+ if num_seqs != -1:
+ max_num_seqs = num_seqs
+
+ # This represents the maximum number of different requests
+ # that will have unique loras, and therefore the max amount of memory
+ # consumption. Create dummy lora request copies from the lora request
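
As an aside on the rope-scaling check added near the top of this hunk: it treats a model as using multimodal rotary embeddings (mrope) when its rope_scaling config either declares type "mrope" or carries an mrope_section. A purely illustrative self-check, with hypothetical Qwen2-VL-style values that are not part of this patch:

    # Same predicate as the helper added to xpu_model_runner.py above.
    def looks_like_mrope(rope_scaling: dict) -> bool:
        return (rope_scaling.get("type", None) == "mrope"
                or rope_scaling.get("mrope_section", None) is not None)

    assert looks_like_mrope({"type": "mrope", "mrope_section": [16, 24, 24]})
    assert looks_like_mrope({"mrope_section": [16, 24, 24]})        # type field omitted
    assert not looks_like_mrope({"type": "linear", "factor": 2.0})  # ordinary rope scaling
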
@@ -39672,27 +39679,10 @@ index 9cf253875..df6ab56c6 100644
+ dummy_lora_requests[idx % len(dummy_lora_requests)]
+ for idx in range(max_num_seqs)
+ ]
+
# Profile memory usage with max_num_sequences sequences and the total
# number of tokens equal to max_num_batched_tokens.
seqs: List[SequenceGroupMetadata] = []
@@ -450,6 +806,15 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
max_num_seqs = 1

batch_size = 0
+ import os
+ self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None)
+ if self_max_num_batched_tokens is not None:
+ max_num_batched_tokens = int(self_max_num_batched_tokens)
+ self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None)
+ if self_max_num_seqs is not None:
+ max_num_seqs = int(self_max_num_seqs)
+ else:
+ max_num_seqs = 1
for group_id in range(max_num_seqs):
seq_len = (max_num_batched_tokens // max_num_seqs +
(group_id < max_num_batched_tokens % max_num_seqs))
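
Note the calling convention this introduces: profile_run now takes explicit num_batched_tokens / num_seqs overrides (see the signature change above), while the IPEX_LLM_SELF_* environment variables are read in XPUWorker.determine_num_available_blocks further down in this diff rather than inside the runner. A minimal sketch of how a caller is expected to drive it; runner stands for a hypothetical XPUModelRunner instance, and only the signature and the -1 fallback come from the patch:

    def profile_with_overrides(runner, num_batched_tokens: int = -1, num_seqs: int = -1) -> None:
        # Positive values override scheduler_config.max_num_batched_tokens /
        # scheduler_config.max_num_seqs inside profile_run(); -1 keeps the configured value.
        assert num_batched_tokens == -1 or num_batched_tokens > 0
        assert num_seqs == -1 or num_seqs > 0
        runner.profile_run(num_batched_tokens, num_seqs)

    # e.g. profile a single 8K-token sequence instead of the configured maximum:
    # profile_with_overrides(runner, num_batched_tokens=8192, num_seqs=1)
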
@@ -466,7 +831,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -466,7 +828,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
seq_data={group_id: dummy_data.seq_data},
sampling_params=sampling_params,
block_tables=None,
@@ -39702,7 +39692,7 @@ index 9cf253875..df6ab56c6 100644
multi_modal_data=dummy_data.multi_modal_data,
multi_modal_placeholders=dummy_data.multi_modal_placeholders)
seqs.append(seq)
@@ -477,9 +843,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -477,9 +840,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
# it by reference, rather by specializing on the value ``None``.
# the `dtype` argument does not matter, and we use `float32` as
# a placeholder (it has wide hardware support).
@@ -39713,7 +39703,7 @@ index 9cf253875..df6ab56c6 100644
finished_requests_ids = [seq.request_id for seq in seqs]
model_input = self.prepare_model_input(
seqs, finished_requests_ids=finished_requests_ids)
@@ -493,21 +857,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -493,21 +854,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
torch.xpu.synchronize()
return

@@ -39759,7 +39749,7 @@ index 9cf253875..df6ab56c6 100644
"""Helper method to prepare the model input based on a given sequence
group. Prepares metadata needed for the base model forward pass but not
metadata for possible additional steps, e.g., sampling.
@@ -519,6 +897,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -519,6 +894,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):

return builder.build() # type: ignore

@@ -39782,7 +39772,7 @@ index 9cf253875..df6ab56c6 100644
def prepare_model_input(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
@@ -558,6 +952,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -558,6 +949,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
raise ValueError(
"XPUModelRunner does not support multi-step execution.")

@@ -39795,7 +39785,7 @@ index 9cf253875..df6ab56c6 100644
model_executable = self.model
if (self.observability_config is not None
and self.observability_config.collect_model_forward_time):
@@ -607,3 +1007,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -607,3 +1004,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
output.model_forward_time = model_forward_time

return [output]
@@ -40700,7 +40690,7 @@ index 000000000..6ad951824
+ return model_input, worker_input, kwargs
\ No newline at end of file
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index 129566605..43d306145 100644
index 129566605..fb7962dfe 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -3,7 +3,8 @@ import gc
@@ -40729,15 +40719,75 @@ index 129566605..43d306145 100644
"""A worker class that executes (a partition of) the model on a GPU.

Each worker is associated with a single XPU device. The worker is
@@ -98,6 +99,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
@@ -97,16 +98,74 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
"""
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.
+ flag = int(os.getenv("IPEX_LLM_FIND_MAX_LENGTH", -1))
+ if flag != -1:
+ assert flag > 0
+ torch.xpu.empty_cache()
+ before_memory = torch.xpu.memory_reserved()
+ max_num_batched_tokens = flag
+ max_num_seqs = 1
+ support_input = []
+ support_kv_cache = []
+ while True:
+ print(f"Profiling with max_num_batched_tokens {max_num_batched_tokens}...")
+ self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs)
+ torch.xpu.synchronize()
+ used_memory = torch.xpu.memory_reserved()
+ total_gpu_memory = torch.xpu.get_device_properties(
+ self.local_rank).total_memory
+ free_gpu_memory = total_gpu_memory - used_memory
+ peak_memory = self.init_gpu_memory - free_gpu_memory
+ assert peak_memory > 0
+ cache_block_size = self.get_cache_block_size_bytes()
+ num_gpu_blocks = int(
+ (total_gpu_memory * self.cache_config.gpu_memory_utilization -
+ peak_memory) // cache_block_size)
+ num_cpu_blocks = int(self.cache_config.swap_space_bytes //
+ cache_block_size)
+ num_gpu_blocks = max(num_gpu_blocks, 0)
+ num_cpu_blocks = max(num_cpu_blocks, 0)
+ gc.collect()
+ torch.xpu.empty_cache()
+ # Begin to handle data...
+ if num_gpu_blocks == 0:
+ break
+ kv_cache_support_length = num_gpu_blocks * self.cache_config.block_size
+ # Too long input...
+ if max_num_batched_tokens > kv_cache_support_length:
+ break
+ support_input.append(max_num_batched_tokens)
+ support_kv_cache.append(kv_cache_support_length)
+ max_num_batched_tokens += 250
+
+ print(f"Recommended max input length: {support_input[len(support_input) - 1]}")
+ print(f"{'input length':<15} {'kv cache length':<15}")
+ print("-" * 30)
+
+ for inp, kv in zip(support_input, support_kv_cache):
+ print(f"{inp:<15} {kv:<15}")
torch.xpu.empty_cache()
+ before_memory = torch.xpu.memory_reserved()

# Execute a forward pass with dummy inputs to profile the memory usage
# of the model.
@@ -106,7 +108,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
- self.model_runner.profile_run()
+ self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None)
+ if self_max_num_batched_tokens is not None:
+ # If this is set, profile using it as the max number of batched tokens
+ max_num_batched_tokens = int(self_max_num_batched_tokens)
+ self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None)
+ if self_max_num_seqs is not None:
+ max_num_seqs = int(self_max_num_seqs)
+ else:
+ max_num_seqs = 1
+ self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs)
+ else:
+ self.model_runner.profile_run()

# Calculate the number of blocks that can be allocated with the
# profiled peak memory.
torch.xpu.synchronize()
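
Both knobs added to determine_num_available_blocks are driven entirely through environment variables, so no serving-side API change is required. A hedged usage sketch; only the variable names and defaults come from this patch, and the engine construction at the end is a placeholder:

    import os

    # Option 1: search for the longest supportable input length. The value is the
    # starting number of batched tokens; the worker then profiles in +250-token steps
    # and prints an "input length / kv cache length" table.
    os.environ["IPEX_LLM_FIND_MAX_LENGTH"] = "4000"

    # Option 2: profile with a fixed budget instead of the scheduler defaults.
    os.environ["IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS"] = "8192"
    os.environ["IPEX_LLM_SELF_MAX_NUM_SEQS"] = "1"   # defaults to 1 when left unset

    # The variables must be set before the engine (and its XPUWorker) is created, e.g.:
    # from vllm import LLM                              # assumes the patched vLLM build
    # llm = LLM(model="/path/to/model", device="xpu")   # placeholder arguments
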
@@ -40746,7 +40796,7 @@ index 129566605..43d306145 100644
total_gpu_memory = torch.xpu.get_device_properties(
self.local_rank).total_memory
free_gpu_memory = total_gpu_memory - used_memory
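
To make the block arithmetic concrete, here is the same calculation with purely illustrative numbers (a 16 GiB device, 90% utilization, a profiled peak of 7 GiB, and an assumed 2 MiB per KV-cache block; real values depend on the model, dtype and block size):

    GiB, MiB = 1024 ** 3, 1024 ** 2

    total_gpu_memory = 16 * GiB       # assumed device memory
    gpu_memory_utilization = 0.9      # cache_config.gpu_memory_utilization
    peak_memory = 7 * GiB             # assumed peak from the dummy profile run
    cache_block_size = 2 * MiB        # assumed bytes per KV-cache block
    block_size = 16                   # tokens per block (vLLM default)

    num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization - peak_memory)
                         // cache_block_size)
    kv_cache_tokens = num_gpu_blocks * block_size
    print(num_gpu_blocks, kv_cache_tokens)   # 3788 blocks -> 60608 tokens of KV cache
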
@@ -130,6 +132,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
@@ -130,6 +189,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
num_cpu_blocks = max(num_cpu_blocks, 0)
gc.collect()
torch.xpu.empty_cache()
@@ -40767,7 +40817,7 @@ index 129566605..43d306145 100644
return num_gpu_blocks, num_cpu_blocks

def _warm_up_model(self) -> None:
@@ -175,4 +191,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
@@ -175,4 +248,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
parallel_config.tensor_parallel_size,
parallel_config.pipeline_parallel_size)
# global all_reduce needed for overall oneccl warm up
Expand Down