Commit da3e5f5

Merge pull request #171 from runpod-workers/update
update vllm 0.8.2
2 parents fa31d96 + acbdf63

File tree

3 files changed (+65, -4 lines)

Dockerfile (+1, -1)
```diff
@@ -12,7 +12,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install --upgrade -r /requirements.txt
 
 # Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer
-RUN python3 -m pip install vllm==0.7.3 && \
+RUN python3 -m pip install vllm==0.8.2 && \
     python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3
 
 # Setup for Option 2: Building the Image with the Model included
```
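The bump itself is a one-line pin change. As a quick sanity check after building the image, the installed version can be probed inside the container; this is an illustrative snippet, not part of the commit:

```python
# Illustrative sanity check: confirm the pinned vLLM version actually
# landed in the image's Python environment.
import vllm

assert vllm.__version__ == "0.8.2", f"unexpected vLLM version: {vllm.__version__}"
print(f"vLLM {vllm.__version__} installed")
```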

README.md (+3, -3)
```diff
@@ -18,9 +18,9 @@ Deploy OpenAI-Compatible Blazing-Fast LLM Endpoints powered by the [vLLM](https:
 ### 1. UI for Deploying vLLM Worker on RunPod console:
 ![Demo of Deploying vLLM Worker on RunPod console with new UI](media/ui_demo.gif)
 
-### 2. Worker vLLM `v2.1.0` with vLLM `0.7.3` now available under `stable` tags
+### 2. Worker vLLM `v2.2.0` with vLLM `0.8.2` now available under `stable` tags
 
-Update v2.0.0 is now available, use the image tag `runpod/worker-v1-vllm:v2.1.0stable-cuda12.1.0`.
+Update v2.2.0 is now available, use the image tag `runpod/worker-v1-vllm:v2.2.0stable-cuda12.1.0`.
 
 ### 3. OpenAI-Compatible [Embedding Worker](https://github.com/runpod-workers/worker-infinity-embedding) Released
 Deploy your own OpenAI-compatible Serverless Endpoint on RunPod with multiple embedding models and fast inference for RAG and more!
@@ -82,7 +82,7 @@ Below is a summary of the available RunPod Worker images, categorized by image s
 
 | CUDA Version | Stable Image Tag | Development Image Tag | Note |
 |--------------|-----------------------------------|-----------------------------------|----------------------------------------------------------------------|
-| 12.1.0 | `runpod/worker-v1-vllm:v2.1.0stable-cuda12.1.0` | `runpod/worker-v1-vllm:v2.1.0dev-cuda12.1.0` | When creating an Endpoint, select CUDA Version 12.3, 12.2 and 12.1 in the filter. |
+| 12.1.0 | `runpod/worker-v1-vllm:v2.2.0stable-cuda12.1.0` | `runpod/worker-v1-vllm:v2.2.0dev-cuda12.1.0` | When creating an Endpoint, select CUDA Version 12.3, 12.2 and 12.1 in the filter. |
```
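For context, an endpoint deployed from one of these images exposes an OpenAI-compatible API. A minimal sketch of calling it, assuming the base URL pattern described in the worker-vllm docs; `<ENDPOINT_ID>`, `<RUNPOD_API_KEY>`, and `<MODEL_NAME>` are placeholders:

```python
# Illustrative sketch: query a deployed worker-v1-vllm endpoint through its
# OpenAI-compatible API. All angle-bracket values are placeholders.
from openai import OpenAI

client = OpenAI(
    base_url="https://api.runpod.ai/v2/<ENDPOINT_ID>/openai/v1",
    api_key="<RUNPOD_API_KEY>",
)

response = client.chat.completions.create(
    model="<MODEL_NAME>",  # the model the endpoint was deployed with
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```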

worker-config.json (+61)
```diff
@@ -1,5 +1,66 @@
 {
   "versions": {
+    "0.8.2": {
+      "imageName": "runpod/worker-v1-vllm:v2.2.0stable-cuda12.1.0",
+      "minimumCudaVersion": "12.1",
+      "categories": [
+        {
+          "title": "LLM Settings",
+          "settings": [
+            "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE",
+            "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH",
+            "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND",
+            "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE",
+            "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING",
+            "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS",
+            "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS",
+            "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA",
+            "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG",
+            "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE",
+            "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS",
+            "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL",
+            "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE",
+            "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE",
+            "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD",
+            "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA",
+            "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD",
+            "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST",
+            "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER"
+          ]
+        },
+        {
+          "title": "Tokenizer Settings",
+          "settings": [
+            "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE"
+          ]
+        },
+        {
+          "title": "System Settings",
+          "settings": [
+            "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE",
+            "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE"
+          ]
+        },
+        {
+          "title": "Streaming Settings",
+          "settings": [
+            "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR"
+          ]
+        },
+        {
+          "title": "OpenAI Settings",
+          "settings": [
+            "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE"
+          ]
+        },
+        {
+          "title": "Serverless Settings",
+          "settings": [
+            "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS"
+          ]
+        }
+      ]
+    },
     "0.7.3": {
       "imageName": "runpod/worker-v1-vllm:v2.1.0stable-cuda12.1.0",
       "minimumCudaVersion": "12.1",
```
