
Commit aa4d536

harryzwh and qinxuye authored
FEAT: add ggufv2 support for vLLM (#3259)
Co-authored-by: harryzwh <[email protected]>
Co-authored-by: qinxuye <[email protected]>
1 parent a37da2e commit aa4d536

File tree

3 files changed: +109 -8 lines

xinference/model/llm/__init__.py

+1 -1

@@ -57,7 +57,7 @@
 
 def check_format_with_engine(model_format, engine):
     # only llama-cpp-python support and only support ggufv2
-    if model_format in ["ggufv2"] and engine != "llama.cpp":
+    if model_format in ["ggufv2"] and engine not in ["llama.cpp", "vLLM"]:
         return False
     if model_format not in ["ggufv2"] and engine == "llama.cpp":
         return False
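To make the effect of this one-line change concrete, here is a minimal, self-contained sketch of the compatibility rule the patched check now encodes. It only mirrors the diff for illustration; it is not the library code itself, and the trailing return True is assumed from the surrounding context rather than shown in the hunk.

# Illustrative sketch of the patched format/engine check, not the library code.
def check_format_with_engine(model_format: str, engine: str) -> bool:
    # ggufv2 models may now be served by either llama.cpp or vLLM.
    if model_format in ["ggufv2"] and engine not in ["llama.cpp", "vLLM"]:
        return False
    # llama.cpp itself still only accepts ggufv2.
    if model_format not in ["ggufv2"] and engine == "llama.cpp":
        return False
    return True  # assumed fall-through, not visible in the hunk


assert check_format_with_engine("ggufv2", "vLLM")          # newly allowed by this commit
assert check_format_with_engine("ggufv2", "llama.cpp")     # unchanged
assert not check_format_with_engine("pytorch", "llama.cpp")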

xinference/model/llm/llm_family.py

+53 -3

@@ -370,14 +370,61 @@ def cache_from_uri(
         raise ValueError(f"Unsupported URL scheme: {src_scheme}")
 
 
+def cache_model_tokenizer_and_config(
+    llm_family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+) -> str:
+    """
+    Download model config.json and tokenizers only
+    """
+    cache_dir = _get_cache_dir_for_model_mem(llm_family, llm_spec, "tokenizer_config")
+    os.makedirs(cache_dir, exist_ok=True)
+    if llm_spec.model_hub == "huggingface":
+        from huggingface_hub import snapshot_download
+
+        download_dir = retry_download(
+            snapshot_download,
+            llm_family.model_name,
+            {
+                "model_size": llm_spec.model_size_in_billions,
+                "model_format": llm_spec.model_format,
+            },
+            llm_spec.model_id,
+            revision=llm_spec.model_revision,
+            allow_patterns=["tokenizer*", "config.json"],
+            local_dir=cache_dir,
+        )
+    elif llm_spec.model_hub == "modelscope":
+        from modelscope.hub.snapshot_download import snapshot_download
+
+        download_dir = retry_download(
+            snapshot_download,
+            llm_family.model_name,
+            {
+                "model_size": llm_spec.model_size_in_billions,
+                "model_format": llm_spec.model_format,
+            },
+            llm_spec.model_id,
+            revision=llm_spec.model_revision,
+            allow_patterns=["tokenizer*", "config.json"],
+            local_dir=cache_dir,
+        )
+    else:
+        raise NotImplementedError(
+            f"Does not support download config.json and "
+            f"tokenizer related files via {llm_spec.model_hub}"
+        )
+    return download_dir
+
+
 def cache_model_config(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
 ):
     """Download model config.json into cache_dir,
     returns local filepath
     """
-    cache_dir = _get_cache_dir_for_model_mem(llm_family, llm_spec)
+    cache_dir = _get_cache_dir_for_model_mem(llm_family, llm_spec, "model_mem")
     config_file = os.path.join(cache_dir, "config.json")
     if not os.path.islink(config_file) and not os.path.exists(config_file):
         os.makedirs(cache_dir, exist_ok=True)
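The new cache_model_tokenizer_and_config helper leans on the hubs' snapshot_download with an allow_patterns filter, so only config.json and the tokenizer files are fetched instead of the full non-quantized weights. A minimal sketch of that idea against huggingface_hub directly, using a hypothetical repo id and target directory:

from huggingface_hub import snapshot_download

# Hypothetical repo id and local directory, for illustration only.
local_path = snapshot_download(
    "Qwen/Qwen2.5-7B-Instruct",
    allow_patterns=["tokenizer*", "config.json"],  # skip the multi-GB weight shards
    local_dir="/tmp/qwen2.5-7b-tokenizer_config",
)
print(local_path)  # directory now holds config.json plus the tokenizer files only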
@@ -400,10 +447,13 @@ def cache_model_config(
 def _get_cache_dir_for_model_mem(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
+    category: str,
     create_if_not_exist=True,
 ):
     """
-    For cal-model-mem only. (might called from supervisor / cli)
+    Get file dir for special usage, like `cal-model-mem` and download partial files for
+
+    e.g. for cal-model-mem, (might called from supervisor / cli)
     Temporary use separate dir from worker's cache_dir, due to issue of different style of symlink.
     """
     quant_suffix = ""
@@ -418,7 +468,7 @@ def _get_cache_dir_for_model_mem(
     if quant_suffix:
         cache_dir_name += f"-{quant_suffix}"
     cache_dir = os.path.realpath(
-        os.path.join(XINFERENCE_CACHE_DIR, "model_mem", cache_dir_name)
+        os.path.join(XINFERENCE_CACHE_DIR, category, cache_dir_name)
    )
    if create_if_not_exist and not os.path.exists(cache_dir):
        os.makedirs(cache_dir, exist_ok=True)
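With the new category argument, the same helper can place files under either a model_mem or a tokenizer_config subdirectory of the Xinference cache. A rough sketch of the resulting layout, assuming a cache root of ~/.xinference/cache (an assumption, not verified here) and a hypothetical cache_dir_name (the real name is derived from the model name, format, size and quantization):

import os

XINFERENCE_CACHE_DIR = os.path.expanduser("~/.xinference/cache")  # assumed default cache root
cache_dir_name = "qwen2.5-instruct-ggufv2-7b"  # hypothetical; built from model metadata in the real code

for category in ("model_mem", "tokenizer_config"):
    # e.g. /home/<user>/.xinference/cache/model_mem/qwen2.5-instruct-ggufv2-7b
    print(os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, category, cache_dir_name)))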

xinference/model/llm/vllm/core.py

+55 -4

@@ -51,7 +51,7 @@
     LoRA,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
-from ..llm_family import CustomLLMFamilyV1
+from ..llm_family import CustomLLMFamilyV1, cache_model_tokenizer_and_config
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
@@ -331,8 +331,10 @@ def load(self):
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
-        if vllm.__version__ >= "0.3.1":
-            # from vllm v0.3.1, it uses cupy as NCCL backend
+        from ..llm_family import LlamaCppLLMSpecV1
+
+        if "0.3.1" <= vllm.__version__ <= "0.3.3":
+            # from vllm v0.3.1 to v0.3.3, it uses cupy as NCCL backend
             # in which cupy will fork a process
             # only for xoscar >= 0.3.0, new process is allowed in subpool
             # besides, xinference set start method as forkserver for unix
@@ -345,6 +347,13 @@ def load(self):
 
         self.prepare_parse_reasoning_content(reasoning_content)
 
+        if (
+            isinstance(self.model_spec, LlamaCppLLMSpecV1)
+            and self.model_spec.model_format == "ggufv2"
+        ):
+            # gguf
+            self._preprocess_load_gguf()
+
         if self.lora_modules is None:
             self.lora_requests = []
         else:
@@ -483,6 +492,45 @@ def wait_for_load(self):
             _, err, tb = self._loading_error
             raise err.with_traceback(tb)
 
+    def _preprocess_load_gguf(self):
+        # check if it is multi gguf files
+        if (
+            not os.path.isfile(self.model_path)
+            and self.model_spec.quantization_parts
+            and self.quantization in self.model_spec.quantization_parts
+        ):
+            raise RuntimeError(
+                "vllm does not support multiple gguf files, please merge them first and "
+                "provide `model_path` with merged file"
+            )
+
+        if "tokenizer" not in self._model_config:
+            # find pytorch format without quantization
+            non_quant_spec = next(
+                spec
+                for spec in self.model_family.model_specs
+                if spec.model_format == "pytorch"
+                and "none" in spec.quantizations
+                and spec.model_size_in_billions
+                == self.model_spec.model_size_in_billions
+            )
+
+            path = cache_model_tokenizer_and_config(self.model_family, non_quant_spec)
+            # other than gguf file, vllm requires to provide tokenizer and hf_config_path
+            self._model_config["tokenizer"] = self._model_config[
+                "hf_config_path"
+            ] = path
+
+        if not os.path.isfile(self.model_path):
+            self.model_path = os.path.realpath(
+                os.path.join(
+                    self.model_path,
+                    self.model_spec.model_file_name_template.format(
+                        quantization=self.quantization
+                    ),
+                )
+            )
+
     def stop(self):
         # though the vLLM engine will shutdown when deleted,
         # but some issue e.g. GH#1682 reported
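_preprocess_load_gguf essentially prepares what vLLM's (still experimental) GGUF support expects: a single merged .gguf file as the model, plus a separate Hugging Face tokenizer/config to fall back on, since GGUF metadata alone is not always sufficient. A hedged sketch of the equivalent direct vLLM call, with hypothetical paths and model names; exact parameter support may vary across vLLM versions:

from vllm import LLM

# Hypothetical paths: a single merged GGUF file plus an HF tokenizer source.
llm = LLM(
    model="/models/qwen2.5-7b-instruct-q4_k_m.gguf",
    tokenizer="Qwen/Qwen2.5-7B-Instruct",  # tokenizer resolved separately from the GGUF
)
print(llm.generate(["Hello"])[0].outputs[0].text)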
@@ -943,7 +991,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "ggufv2"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
@@ -959,6 +1007,9 @@ def match_json(
         else:
             if "4" not in quantization:
                 return False
+        if llm_spec.model_format == "ggufv2":
+            if not (VLLM_INSTALLED and vllm.__version__ >= "0.8.2"):
+                return False
         if isinstance(llm_family, CustomLLMFamilyV1):
             if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS:
                 return False
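One caveat worth noting: the version gates in this commit compare version strings lexicographically (for example vllm.__version__ >= "0.8.2"), which works for the releases in question but will misclassify future two-digit minor releases such as 0.10.0. A possible hardening, shown only as a sketch and not as part of this commit, is to compare parsed versions via packaging:

from packaging.version import Version

def vllm_supports_gguf(vllm_version: str) -> bool:
    # Mirrors the gate above: GGUF on the vLLM engine requires vLLM >= 0.8.2.
    return Version(vllm_version) >= Version("0.8.2")

assert vllm_supports_gguf("0.8.2")
assert vllm_supports_gguf("0.10.0")   # a plain string comparison would wrongly reject this
assert not vllm_supports_gguf("0.7.3")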

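End to end, the commit means a ggufv2 model can be launched on the vLLM engine rather than only on llama.cpp. A hypothetical launch through the Python client, assuming the usual launch_model keyword arguments; the endpoint, model name, size and quantization below are placeholders, not values taken from this commit:

from xinference.client import Client

client = Client("http://localhost:9997")  # assumed local Xinference endpoint
model_uid = client.launch_model(
    model_name="qwen2.5-instruct",   # placeholder model name
    model_engine="vLLM",             # now accepted for ggufv2 after this commit
    model_format="ggufv2",
    model_size_in_billions=7,        # placeholder size
    quantization="q4_k_m",           # placeholder quantization
)
print(model_uid)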