Commit 3ee6dec

update vllm patch (#13064)
1 parent 1d7f4a8 commit 3ee6dec

1 file changed: +84 -34 lines changed

docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch

@@ -39043,7 +39043,7 @@ index 3ac7fb8df..249b3ed2d 100644
              and self.observability_config.collect_model_execute_time):
          output.tensors["model_execute_time"] = torch.tensor(
  diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
 -index 9cf253875..df6ab56c6 100644
 +index 9cf253875..34d098486 100644
  --- a/vllm/worker/xpu_model_runner.py
  +++ b/vllm/worker/xpu_model_runner.py
  @@ -3,8 +3,8 @@ import time
@@ -39606,7 +39606,7 @@ index 9cf253875..df6ab56c6 100644
 
          self.sampling_metadata_cache: SamplingMetadataCache = \
              SamplingMetadataCache() \
-@@ -415,10 +719,38 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -415,16 +719,74 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
          logger.info("Loading model weights took %.4f GB",
                      self.model_memory_usage / float(2**30))
 
@@ -39643,12 +39643,19 @@ index 9cf253875..df6ab56c6 100644
 +        return rope_scaling.get("type", None) == "mrope" or rope_scaling.get("mrope_section", None) is not None
 +
      @torch.inference_mode()
-     def profile_run(self) -> None:
+-    def profile_run(self) -> None:
++    def profile_run(self, num_batched_tokens=-1, num_seqs=-1) -> None:
          # Enable top-k sampling to reflect the accurate memory usage.
-@@ -426,6 +758,30 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+         sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
          max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
++        assert (num_batched_tokens == -1 or num_batched_tokens > 0)
++        assert (num_seqs == -1 or num_seqs > 0)
          max_num_seqs = self.scheduler_config.max_num_seqs
-
++        if num_batched_tokens != -1:
++            max_num_batched_tokens = num_batched_tokens
++        if num_seqs != -1:
++            max_num_seqs = num_seqs
++
 +        # This represents the maximum number of different requests
 +        # that will have unique loras, an therefore the max amount of memory
 +        # consumption create dummy lora request copies from the lora request
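Editor's note: in the updated patch, profile_run no longer reads its budget from environment variables inside the model runner; the caller passes num_batched_tokens and num_seqs, where -1 means "keep the scheduler defaults". The sketch below restates that override logic on its own, using a hypothetical SchedulerConfig stand-in rather than vLLM's real config object.

# Minimal sketch of the override logic, assuming a stand-in SchedulerConfig.
from dataclasses import dataclass

@dataclass
class SchedulerConfig:  # hypothetical stand-in, not vLLM's class
    max_num_batched_tokens: int = 2048
    max_num_seqs: int = 256

def resolve_profile_budget(scheduler_config: SchedulerConfig,
                           num_batched_tokens: int = -1,
                           num_seqs: int = -1):
    # -1 means "use the scheduler defaults", matching the patched profile_run
    assert num_batched_tokens == -1 or num_batched_tokens > 0
    assert num_seqs == -1 or num_seqs > 0
    max_num_batched_tokens = scheduler_config.max_num_batched_tokens
    max_num_seqs = scheduler_config.max_num_seqs
    if num_batched_tokens != -1:
        max_num_batched_tokens = num_batched_tokens
    if num_seqs != -1:
        max_num_seqs = num_seqs
    return max_num_batched_tokens, max_num_seqs

# Example: profile a single 8000-token sequence
print(resolve_profile_budget(SchedulerConfig(), 8000, 1))  # -> (8000, 1)

Called with explicit values like (8000, 1), this is the shape of call the worker-side changes later in this diff rely on.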
@@ -39672,27 +39679,10 @@ index 9cf253875..df6ab56c6 100644
 +            dummy_lora_requests[idx % len(dummy_lora_requests)]
 +            for idx in range(max_num_seqs)
 +        ]
-+
+
          # Profile memory usage with max_num_sequences sequences and the total
          # number of tokens equal to max_num_batched_tokens.
-         seqs: List[SequenceGroupMetadata] = []
-@@ -450,6 +806,15 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
-             max_num_seqs = 1
-
-         batch_size = 0
-+        import os
-+        self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None)
-+        if self_max_num_batched_tokens is not None:
-+            max_num_batched_tokens = int(self_max_num_batched_tokens)
-+        self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None)
-+        if self_max_num_seqs is not None:
-+            max_num_seqs = int(self_max_num_seqs)
-+        else:
-+            max_num_seqs = 1
-         for group_id in range(max_num_seqs):
-             seq_len = (max_num_batched_tokens // max_num_seqs +
-                        (group_id < max_num_batched_tokens % max_num_seqs))
-@@ -466,7 +831,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -466,7 +828,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
          seq_data={group_id: dummy_data.seq_data},
          sampling_params=sampling_params,
          block_tables=None,
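Editor's note: the lines removed above also show the formula that splits the profiling token budget across dummy sequences: every group gets the integer quotient, and the first (budget mod groups) groups get one extra token. A small standalone illustration of that arithmetic, not part of the patch:

def split_tokens(max_num_batched_tokens: int, max_num_seqs: int) -> list:
    # Same expression as the seq_len line in the diff context above.
    return [
        max_num_batched_tokens // max_num_seqs +
        (group_id < max_num_batched_tokens % max_num_seqs)
        for group_id in range(max_num_seqs)
    ]

# 10 tokens over 4 dummy sequences -> [3, 3, 2, 2]; lengths always sum to the budget.
assert split_tokens(10, 4) == [3, 3, 2, 2]
assert sum(split_tokens(10, 4)) == 10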
@@ -39702,7 +39692,7 @@ index 9cf253875..df6ab56c6 100644
              multi_modal_data=dummy_data.multi_modal_data,
              multi_modal_placeholders=dummy_data.multi_modal_placeholders)
          seqs.append(seq)
-@@ -477,9 +843,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -477,9 +840,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
          # it by reference, rather by specializing on the value ``None``.
          # the `dtype` argument does not matter, and we use `float32` as
          # a placeholder (it has wide hardware support).
@@ -39713,7 +39703,7 @@ index 9cf253875..df6ab56c6 100644
          finished_requests_ids = [seq.request_id for seq in seqs]
          model_input = self.prepare_model_input(
              seqs, finished_requests_ids=finished_requests_ids)
-@@ -493,21 +857,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -493,21 +854,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
          torch.xpu.synchronize()
          return
 
@@ -39759,7 +39749,7 @@ index 9cf253875..df6ab56c6 100644
          """Helper method to prepare the model input based on a given sequence
          group. Prepares metadata needed for the base model forward pass but not
          metadata for possible additional steps, e.g., sampling.
-@@ -519,6 +897,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -519,6 +894,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 
          return builder.build()  # type: ignore
 
@@ -39782,7 +39772,7 @@ index 9cf253875..df6ab56c6 100644
      def prepare_model_input(
          self,
          seq_group_metadata_list: List[SequenceGroupMetadata],
-@@ -558,6 +952,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -558,6 +949,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
              raise ValueError(
                  "XPUModelRunner does not support multi-step execution.")
 
@@ -39795,7 +39785,7 @@ index 9cf253875..df6ab56c6 100644
          model_executable = self.model
          if (self.observability_config is not None
                  and self.observability_config.collect_model_forward_time):
-@@ -607,3 +1007,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -607,3 +1004,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
          output.model_forward_time = model_forward_time
 
          return [output]
@@ -40700,7 +40690,7 @@ index 000000000..6ad951824
 +    return model_input, worker_input, kwargs
 \ No newline at end of file
 diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
-index 129566605..43d306145 100644
+index 129566605..fb7962dfe 100644
 --- a/vllm/worker/xpu_worker.py
 +++ b/vllm/worker/xpu_worker.py
 @@ -3,7 +3,8 @@ import gc
@@ -40729,15 +40719,75 @@ index 129566605..43d306145 100644
      """A worker class that executes (a partition of) the model on a GPU.
 
      Each worker is associated with a single XPU device. The worker is
-@@ -98,6 +99,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+@@ -97,16 +98,74 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+         """
          # Profile the memory usage of the model and get the maximum number of
          # cache blocks that can be allocated with the remaining free memory.
++        flag = int(os.getenv("IPEX_LLM_FIND_MAX_LENGTH", -1))
++        if flag != -1:
++            assert flag > 0
++            torch.xpu.empty_cache()
++            before_memory = torch.xpu.memory_reserved()
++            max_num_batched_tokens = flag
++            max_num_seqs = 1
++            support_input = []
++            support_kv_cache = []
++            while True:
++                print(f"Profiling with max_num_batched_tokens {max_num_batched_tokens}...")
++                self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs)
++                torch.xpu.synchronize()
++                used_memory = torch.xpu.memory_reserved()
++                total_gpu_memory = torch.xpu.get_device_properties(
++                    self.local_rank).total_memory
++                free_gpu_memory = total_gpu_memory - used_memory
++                peak_memory = self.init_gpu_memory - free_gpu_memory
++                assert peak_memory > 0
++                cache_block_size = self.get_cache_block_size_bytes()
++                num_gpu_blocks = int(
++                    (total_gpu_memory * self.cache_config.gpu_memory_utilization -
++                     peak_memory) // cache_block_size)
++                num_cpu_blocks = int(self.cache_config.swap_space_bytes //
++                                     cache_block_size)
++                num_gpu_blocks = max(num_gpu_blocks, 0)
++                num_cpu_blocks = max(num_cpu_blocks, 0)
++                gc.collect()
++                torch.xpu.empty_cache()
++                # Begin to handle data...
++                if num_gpu_blocks == 0:
++                    break
++                kv_cache_support_length = num_gpu_blocks * self.cache_config.block_size
++                # Too long input...
++                if max_num_batched_tokens > kv_cache_support_length:
++                    break
++                support_input.append(max_num_batched_tokens)
++                support_kv_cache.append(kv_cache_support_length)
++                max_num_batched_tokens += 250
++
++            print(f"Recommended max input length: {support_input[len(support_input) - 1]}")
++            print(f"{'input length':<15} {'kv cache length':<15}")
++            print("-" * 30)
++
++            for inp, kv in zip(support_input, support_kv_cache):
++                print(f"{inp:<15} {kv:<15}")
          torch.xpu.empty_cache()
 +        before_memory = torch.xpu.memory_reserved()
 
          # Execute a forward pass with dummy inputs to profile the memory usage
          # of the model.
-@@ -106,7 +108,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+-        self.model_runner.profile_run()
++        self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None)
++        if self_max_num_batched_tokens is not None:
++            # If this get set, then profile using max input length
++            max_num_batched_tokens = int(self_max_num_batched_tokens)
++            self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None)
++            if self_max_num_seqs is not None:
++                max_num_seqs = int(self_max_num_seqs)
++            else:
++                max_num_seqs = 1
++            self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs)
++        else:
++            self.model_runner.profile_run()
+
          # Calculate the number of blocks that can be allocated with the
          # profiled peak memory.
          torch.xpu.synchronize()
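Editor's note: the search loop added above keeps raising max_num_batched_tokens by 250 until the profiled peak memory leaves too few KV-cache blocks to cover the input itself. The block count follows the usual vLLM budget: the allowed memory (total device memory times gpu_memory_utilization) minus the profiled peak, floor-divided by the size of one cache block. A standalone sketch of that arithmetic with placeholder numbers (the byte figures are illustrative, not measurements from the patch):

def estimate_gpu_blocks(total_gpu_memory: int,
                        gpu_memory_utilization: float,
                        peak_memory: int,
                        cache_block_size: int) -> int:
    # Same arithmetic as the loop above: allowed memory minus profiled peak,
    # floor-divided by one KV-cache block, clamped at zero.
    num_gpu_blocks = int(
        (total_gpu_memory * gpu_memory_utilization - peak_memory)
        // cache_block_size)
    return max(num_gpu_blocks, 0)

GiB = 1024 ** 3
# Placeholder figures: a 16 GiB device, 90% utilization, 10 GiB peak during the
# profile run, and an assumed 2 MiB per cache block.
blocks = estimate_gpu_blocks(16 * GiB, 0.9, 10 * GiB, 2 * 1024 * 1024)
print(blocks)  # ~2252 blocks under these assumptions

In the patch, kv_cache_support_length is then num_gpu_blocks * cache_config.block_size, and the loop records each (input length, kv cache length) pair until no blocks remain or the input no longer fits in the cache.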
@@ -40746,7 +40796,7 @@ index 129566605..43d306145 100644
          total_gpu_memory = torch.xpu.get_device_properties(
              self.local_rank).total_memory
          free_gpu_memory = total_gpu_memory - used_memory
-@@ -130,6 +132,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+@@ -130,6 +189,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
          num_cpu_blocks = max(num_cpu_blocks, 0)
          gc.collect()
          torch.xpu.empty_cache()
@@ -40767,7 +40817,7 @@ index 129566605..43d306145 100644
          return num_gpu_blocks, num_cpu_blocks
 
      def _warm_up_model(self) -> None:
-@@ -175,4 +191,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+@@ -175,4 +248,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
          parallel_config.tensor_parallel_size,
          parallel_config.pipeline_parallel_size)
          # global all_reduce needed for overall oneccl warm up
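Editor's note: the environment variables this commit wires up are plain os.getenv reads, so they only need to be present in the serving process's environment before the worker profiles memory. A hedged sketch of the knobs follows; the variable names come from the diff, the example values are arbitrary, and the two options are shown together but would normally be used separately.

import os

# Option 1: search for the longest supported input length. The worker starts
# profiling at this many tokens, grows the budget by 250 per iteration, and
# prints an input-length / kv-cache-length table plus a recommended maximum.
os.environ["IPEX_LLM_FIND_MAX_LENGTH"] = "4000"  # example starting point

# Option 2: pin the profiling budget instead of using the scheduler defaults.
# When only the token budget is set, the worker falls back to max_num_seqs = 1.
os.environ["IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS"] = "8192"
os.environ["IPEX_LLM_SELF_MAX_NUM_SEQS"] = "1"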
