@@ -39043,7 +39043,7 @@ index 3ac7fb8df..249b3ed2d 100644
and self.observability_config.collect_model_execute_time):
output.tensors["model_execute_time"] = torch.tensor(
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
- index 9cf253875..df6ab56c6 100644
+ index 9cf253875..34d098486 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -3,8 +3,8 @@ import time
@@ -39606,7 +39606,7 @@ index 9cf253875..df6ab56c6 100644
self.sampling_metadata_cache: SamplingMetadataCache = \
SamplingMetadataCache() \
- @@ -415,10 +719,38 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+ @@ -415,16 +719,74 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
logger.info("Loading model weights took %.4f GB",
self.model_memory_usage / float(2**30))
@@ -39643,12 +39643,19 @@ index 9cf253875..df6ab56c6 100644
+ return rope_scaling.get("type", None) == "mrope" or rope_scaling.get("mrope_section", None) is not None
+
@torch.inference_mode()
- def profile_run(self) -> None:
+ - def profile_run(self) -> None:
+ + def profile_run(self, num_batched_tokens=-1, num_seqs=-1) -> None:
# Enable top-k sampling to reflect the accurate memory usage.
- @@ -426,6 +758,30 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+ sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
+ + assert (num_batched_tokens == -1 or num_batched_tokens > 0)
+ + assert (num_seqs == -1 or num_seqs > 0)
max_num_seqs = self.scheduler_config.max_num_seqs
-
+ + if num_batched_tokens != -1:
+ + max_num_batched_tokens = num_batched_tokens
+ + if num_seqs != -1:
+ + max_num_seqs = num_seqs
+ +
+ # This represents the maximum number of different requests
+ # that will have unique loras, an therefore the max amount of memory
+ # consumption create dummy lora request copies from the lora request
@@ -39672,27 +39679,10 @@ index 9cf253875..df6ab56c6 100644
+ dummy_lora_requests[idx % len(dummy_lora_requests)]
+ for idx in range(max_num_seqs)
+ ]
- +
+
# Profile memory usage with max_num_sequences sequences and the total
# number of tokens equal to max_num_batched_tokens.
- seqs: List[SequenceGroupMetadata] = []
- @@ -450,6 +806,15 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
- max_num_seqs = 1
-
- batch_size = 0
- + import os
- + self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None)
- + if self_max_num_batched_tokens is not None:
- + max_num_batched_tokens = int(self_max_num_batched_tokens)
- + self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None)
- + if self_max_num_seqs is not None:
- + max_num_seqs = int(self_max_num_seqs)
- + else:
- + max_num_seqs = 1
- for group_id in range(max_num_seqs):
- seq_len = (max_num_batched_tokens // max_num_seqs +
- (group_id < max_num_batched_tokens % max_num_seqs))
- @@ -466,7 +831,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+ @@ -466,7 +828,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
seq_data={group_id: dummy_data.seq_data},
sampling_params=sampling_params,
block_tables=None,
@@ -39702,7 +39692,7 @@ index 9cf253875..df6ab56c6 100644
multi_modal_data=dummy_data.multi_modal_data,
multi_modal_placeholders=dummy_data.multi_modal_placeholders)
seqs.append(seq)
- @@ -477,9 +843,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+ @@ -477,9 +840,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
# it by reference, rather by specializing on the value ``None``.
# the `dtype` argument does not matter, and we use `float32` as
# a placeholder (it has wide hardware support).
@@ -39713,7 +39703,7 @@ index 9cf253875..df6ab56c6 100644
finished_requests_ids = [seq.request_id for seq in seqs]
model_input = self.prepare_model_input(
seqs, finished_requests_ids=finished_requests_ids)
- @@ -493,21 +857,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+ @@ -493,21 +854,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
torch.xpu.synchronize()
return
@@ -39759,7 +39749,7 @@ index 9cf253875..df6ab56c6 100644
"""Helper method to prepare the model input based on a given sequence
group. Prepares metadata needed for the base model forward pass but not
metadata for possible additional steps, e.g., sampling.
- @@ -519,6 +897,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+ @@ -519,6 +894,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
return builder.build() # type: ignore
@@ -39782,7 +39772,7 @@ index 9cf253875..df6ab56c6 100644
def prepare_model_input(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
- @@ -558,6 +952,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+ @@ -558,6 +949,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
raise ValueError(
"XPUModelRunner does not support multi-step execution.")
@@ -39795,7 +39785,7 @@ index 9cf253875..df6ab56c6 100644
model_executable = self.model
if (self.observability_config is not None
and self.observability_config.collect_model_forward_time):
- @@ -607,3 +1007,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+ @@ -607,3 +1004,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
output.model_forward_time = model_forward_time
return [output]
@@ -40700,7 +40690,7 @@ index 000000000..6ad951824
+ return model_input, worker_input, kwargs
\ No newline at end of file
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
- index 129566605..43d306145 100644
+ index 129566605..fb7962dfe 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -3,7 +3,8 @@ import gc
@@ -40729,15 +40719,75 @@ index 129566605..43d306145 100644
"""A worker class that executes (a partition of) the model on a GPU.
Each worker is associated with a single XPU device. The worker is
- @@ -98,6 +99,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+ @@ -97,16 +98,74 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+ """
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.
+ + flag = int(os.getenv("IPEX_LLM_FIND_MAX_LENGTH", -1))
+ + if flag != -1:
+ + assert flag > 0
+ + torch.xpu.empty_cache()
+ + before_memory = torch.xpu.memory_reserved()
+ + max_num_batched_tokens = flag
+ + max_num_seqs = 1
+ + support_input = []
+ + support_kv_cache = []
+ + while True:
+ + print(f"Profiling with max_num_batched_tokens {max_num_batched_tokens}...")
+ + self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs)
+ + torch.xpu.synchronize()
+ + used_memory = torch.xpu.memory_reserved()
+ + total_gpu_memory = torch.xpu.get_device_properties(
+ + self.local_rank).total_memory
+ + free_gpu_memory = total_gpu_memory - used_memory
+ + peak_memory = self.init_gpu_memory - free_gpu_memory
+ + assert peak_memory > 0
+ + cache_block_size = self.get_cache_block_size_bytes()
+ + num_gpu_blocks = int(
+ + (total_gpu_memory * self.cache_config.gpu_memory_utilization -
+ + peak_memory) // cache_block_size)
+ + num_cpu_blocks = int(self.cache_config.swap_space_bytes //
+ + cache_block_size)
+ + num_gpu_blocks = max(num_gpu_blocks, 0)
+ + num_cpu_blocks = max(num_cpu_blocks, 0)
+ + gc.collect()
+ + torch.xpu.empty_cache()
+ + # Begin to handle data...
+ + if num_gpu_blocks == 0:
+ + break
+ + kv_cache_support_length = num_gpu_blocks * self.cache_config.block_size
+ + # Too long input...
+ + if max_num_batched_tokens > kv_cache_support_length:
+ + break
+ + support_input.append(max_num_batched_tokens)
+ + support_kv_cache.append(kv_cache_support_length)
+ + max_num_batched_tokens += 250
+ +
+ + print(f"Recommended max input length: {support_input[len(support_input) - 1]}")
+ + print(f"{'input length':<15} {'kv cache length':<15}")
+ + print("-" * 30)
+ +
+ + for inp, kv in zip(support_input, support_kv_cache):
+ + print(f"{inp:<15} {kv:<15}")
torch.xpu.empty_cache()
+ before_memory = torch.xpu.memory_reserved()
# Execute a forward pass with dummy inputs to profile the memory usage
# of the model.
- @@ -106,7 +108,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+ - self.model_runner.profile_run()
+ + self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None)
+ + if self_max_num_batched_tokens is not None:
+ + # If this get set, then profile using max input length
+ + max_num_batched_tokens = int(self_max_num_batched_tokens)
+ + self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None)
+ + if self_max_num_seqs is not None:
+ + max_num_seqs = int(self_max_num_seqs)
+ + else:
+ + max_num_seqs = 1
+ + self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs)
+ + else:
+ + self.model_runner.profile_run()
+
# Calculate the number of blocks that can be allocated with the
# profiled peak memory.
torch.xpu.synchronize()
@@ -40746,7 +40796,7 @@ index 129566605..43d306145 100644
total_gpu_memory = torch.xpu.get_device_properties(
self.local_rank).total_memory
free_gpu_memory = total_gpu_memory - used_memory
- @@ -130,6 +132,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+ @@ -130,6 +189,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
num_cpu_blocks = max(num_cpu_blocks, 0)
gc.collect()
torch.xpu.empty_cache()
@@ -40767,7 +40817,7 @@ index 129566605..43d306145 100644
return num_gpu_blocks, num_cpu_blocks
def _warm_up_model(self) -> None:
- @@ -175,4 +191,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+ @@ -175,4 +248,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
parallel_config.tensor_parallel_size,
parallel_config.pipeline_parallel_size)
# global all_reduce needed for overall oneccl warm up