     LoRA,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
-from ..llm_family import CustomLLMFamilyV1
+from ..llm_family import CustomLLMFamilyV1, cache_model_tokenizer_and_config
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
@@ -331,8 +331,10 @@ def load(self):
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
-        if vllm.__version__ >= "0.3.1":
-            # from vllm v0.3.1, it uses cupy as NCCL backend
+        from ..llm_family import LlamaCppLLMSpecV1
+
+        if "0.3.1" <= vllm.__version__ <= "0.3.3":
+            # from vllm v0.3.1 to v0.3.3, it uses cupy as NCCL backend
             # in which cupy will fork a process
             # only for xoscar >= 0.3.0, new process is allowed in subpool
             # besides, xinference set start method as forkserver for unix
@@ -345,6 +347,13 @@ def load(self):
 
         self.prepare_parse_reasoning_content(reasoning_content)
 
+        if (
+            isinstance(self.model_spec, LlamaCppLLMSpecV1)
+            and self.model_spec.model_format == "ggufv2"
+        ):
+            # gguf model: extra preprocessing before handing it to vllm
+            self._preprocess_load_gguf()
+
         if self.lora_modules is None:
             self.lora_requests = []
         else:
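
For context on the branch added above: vLLM's GGUF support expects a single merged .gguf file and is normally pointed at the tokenizer and config of the original, non-quantized Hugging Face model, which is what `_preprocess_load_gguf` arranges further down. A minimal standalone sketch of that usage, with placeholder paths and model ids that are assumptions rather than values from this patch:

# Minimal sketch of loading a single-file GGUF checkpoint with vLLM.
# The file path and tokenizer id below are placeholders, not xinference values.
from vllm import LLM, SamplingParams

llm = LLM(
    model="/models/qwen2.5-7b-instruct-q4_k_m.gguf",  # one merged gguf file
    tokenizer="Qwen/Qwen2.5-7B-Instruct",             # tokenizer of the original HF model
)
outputs = llm.generate(["Hello!"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
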
@@ -483,6 +492,45 @@ def wait_for_load(self):
             _, err, tb = self._loading_error
             raise err.with_traceback(tb)
 
+    def _preprocess_load_gguf(self):
+        # check whether the model is split into multiple gguf files
+        if (
+            not os.path.isfile(self.model_path)
+            and self.model_spec.quantization_parts
+            and self.quantization in self.model_spec.quantization_parts
+        ):
+            raise RuntimeError(
+                "vllm does not support multiple gguf files, please merge them first "
+                "and provide `model_path` pointing to the merged file"
+            )
+
+        if "tokenizer" not in self._model_config:
+            # find the pytorch-format spec without quantization
+            non_quant_spec = next(
+                spec
+                for spec in self.model_family.model_specs
+                if spec.model_format == "pytorch"
+                and "none" in spec.quantizations
+                and spec.model_size_in_billions
+                == self.model_spec.model_size_in_billions
+            )
+
+            path = cache_model_tokenizer_and_config(self.model_family, non_quant_spec)
+            # besides the gguf file, vllm requires a tokenizer and hf_config_path
+            self._model_config["tokenizer"] = self._model_config[
+                "hf_config_path"
+            ] = path
+
+        if not os.path.isfile(self.model_path):
+            self.model_path = os.path.realpath(
+                os.path.join(
+                    self.model_path,
+                    self.model_spec.model_file_name_template.format(
+                        quantization=self.quantization
+                    ),
+                )
+            )
+
     def stop(self):
         # though the vLLM engine will shutdown when deleted,
         # but some issue e.g. GH#1682 reported
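
When `model_path` points at a directory rather than a file, the last step of `_preprocess_load_gguf` joins it with `model_file_name_template` rendered for the selected quantization. A small illustration with made-up names; the real template and paths come from the model spec and cache layout:

import os

# Made-up example values, for illustration only.
model_dir = "/data/xinference/cache/my-model-ggufv2"
file_name_template = "my-model.{quantization}.gguf"
quantization = "Q4_K_M"

model_path = os.path.realpath(
    os.path.join(model_dir, file_name_template.format(quantization=quantization))
)
# -> /data/xinference/cache/my-model-ggufv2/my-model.Q4_K_M.gguf
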
@@ -943,7 +991,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "ggufv2"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
@@ -959,6 +1007,9 @@ def match_json(
             else:
                 if "4" not in quantization:
                     return False
+        if llm_spec.model_format == "ggufv2":
+            if not (VLLM_INSTALLED and vllm.__version__ >= "0.8.2"):
+                return False
         if isinstance(llm_family, CustomLLMFamilyV1):
             if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS:
                 return False
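
The new ggufv2 gate compares `vllm.__version__` as a plain string, mirroring the existing version checks in this file. If a check that stays correct for versions such as "0.10.0" were wanted, `packaging.version` is the usual tool; the following is a sketch only, not part of the patch:

from packaging.version import Version

def vllm_supports_gguf(vllm_version: str) -> bool:
    # ">= 0.8.2" as a real version comparison instead of a string comparison
    return Version(vllm_version) >= Version("0.8.2")

assert vllm_supports_gguf("0.8.2")
assert vllm_supports_gguf("0.10.0")    # a string comparison would order this before "0.8.2"
assert not vllm_supports_gguf("0.7.3")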