
Commit aa4d536

harryzwh and qinxuye authored
FEAT: add ggufv2 support for vLLM (#3259)
Co-authored-by: harryzwh <[email protected]>
Co-authored-by: qinxuye <[email protected]>
1 parent a37da2e commit aa4d536

File tree

3 files changed: +109 -8 lines

xinference/model/llm/__init__.py

+1 -1

@@ -57,7 +57,7 @@
 
 def check_format_with_engine(model_format, engine):
     # only llama-cpp-python support and only support ggufv2
-    if model_format in ["ggufv2"] and engine != "llama.cpp":
+    if model_format in ["ggufv2"] and engine not in ["llama.cpp", "vLLM"]:
         return False
     if model_format not in ["ggufv2"] and engine == "llama.cpp":
         return False
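To make the effect of this one-line change concrete, here is a minimal, self-contained sketch of the compatibility rule the patched check now encodes. It only mirrors the diff for illustration; it is not the library code itself, and the trailing return True is assumed from the surrounding context rather than shown in the hunk.

# Illustrative sketch of the patched format/engine check, not the library code.
def check_format_with_engine(model_format: str, engine: str) -> bool:
    # ggufv2 models may now be served by either llama.cpp or vLLM.
    if model_format in ["ggufv2"] and engine not in ["llama.cpp", "vLLM"]:
        return False
    # llama.cpp itself still only accepts ggufv2.
    if model_format not in ["ggufv2"] and engine == "llama.cpp":
        return False
    return True  # assumed fall-through, not visible in the hunk


assert check_format_with_engine("ggufv2", "vLLM")          # newly allowed by this commit
assert check_format_with_engine("ggufv2", "llama.cpp")     # unchanged
assert not check_format_with_engine("pytorch", "llama.cpp")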

xinference/model/llm/llm_family.py

+53 -3

@@ -370,14 +370,61 @@ def cache_from_uri(
         raise ValueError(f"Unsupported URL scheme: {src_scheme}")
 
 
+def cache_model_tokenizer_and_config(
+    llm_family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+) -> str:
+    """
+    Download model config.json and tokenizers only
+    """
+    cache_dir = _get_cache_dir_for_model_mem(llm_family, llm_spec, "tokenizer_config")
+    os.makedirs(cache_dir, exist_ok=True)
+    if llm_spec.model_hub == "huggingface":
+        from huggingface_hub import snapshot_download
+
+        download_dir = retry_download(
+            snapshot_download,
+            llm_family.model_name,
+            {
+                "model_size": llm_spec.model_size_in_billions,
+                "model_format": llm_spec.model_format,
+            },
+            llm_spec.model_id,
+            revision=llm_spec.model_revision,
+            allow_patterns=["tokenizer*", "config.json"],
+            local_dir=cache_dir,
+        )
+    elif llm_spec.model_hub == "modelscope":
+        from modelscope.hub.snapshot_download import snapshot_download
+
+        download_dir = retry_download(
+            snapshot_download,
+            llm_family.model_name,
+            {
+                "model_size": llm_spec.model_size_in_billions,
+                "model_format": llm_spec.model_format,
+            },
+            llm_spec.model_id,
+            revision=llm_spec.model_revision,
+            allow_patterns=["tokenizer*", "config.json"],
+            local_dir=cache_dir,
+        )
+    else:
+        raise NotImplementedError(
+            f"Does not support download config.json and "
+            f"tokenizer related files via {llm_spec.model_hub}"
+        )
+    return download_dir
+
+
 def cache_model_config(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
 ):
     """Download model config.json into cache_dir,
     returns local filepath
     """
-    cache_dir = _get_cache_dir_for_model_mem(llm_family, llm_spec)
+    cache_dir = _get_cache_dir_for_model_mem(llm_family, llm_spec, "model_mem")
     config_file = os.path.join(cache_dir, "config.json")
     if not os.path.islink(config_file) and not os.path.exists(config_file):
         os.makedirs(cache_dir, exist_ok=True)
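The new cache_model_tokenizer_and_config helper leans on the hubs' snapshot_download with an allow_patterns filter, so only config.json and the tokenizer files are fetched instead of the full non-quantized weights. A minimal sketch of that idea against huggingface_hub directly, using a hypothetical repo id and target directory:

from huggingface_hub import snapshot_download

# Hypothetical repo id and local directory, for illustration only.
local_path = snapshot_download(
    "Qwen/Qwen2.5-7B-Instruct",
    allow_patterns=["tokenizer*", "config.json"],  # skip the multi-GB weight shards
    local_dir="/tmp/qwen2.5-7b-tokenizer_config",
)
print(local_path)  # directory now holds config.json plus the tokenizer files only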
@@ -400,10 +447,13 @@ def cache_model_config(
 def _get_cache_dir_for_model_mem(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
+    category: str,
     create_if_not_exist=True,
 ):
     """
-    For cal-model-mem only. (might called from supervisor / cli)
+    Get file dir for special usage, like `cal-model-mem` and download partial files for
+
+    e.g. for cal-model-mem, (might called from supervisor / cli)
     Temporary use separate dir from worker's cache_dir, due to issue of different style of symlink.
     """
     quant_suffix = ""
@@ -418,7 +468,7 @@ def _get_cache_dir_for_model_mem(
     if quant_suffix:
         cache_dir_name += f"-{quant_suffix}"
     cache_dir = os.path.realpath(
-        os.path.join(XINFERENCE_CACHE_DIR, "model_mem", cache_dir_name)
+        os.path.join(XINFERENCE_CACHE_DIR, category, cache_dir_name)
    )
    if create_if_not_exist and not os.path.exists(cache_dir):
        os.makedirs(cache_dir, exist_ok=True)
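With the new category argument, the same helper can place files under either a model_mem or a tokenizer_config subdirectory of the Xinference cache. A rough sketch of the resulting layout, assuming a cache root of ~/.xinference/cache (an assumption, not verified here) and a hypothetical cache_dir_name (the real name is derived from the model name, format, size and quantization):

import os

XINFERENCE_CACHE_DIR = os.path.expanduser("~/.xinference/cache")  # assumed default cache root
cache_dir_name = "qwen2.5-instruct-ggufv2-7b"  # hypothetical; built from model metadata in the real code

for category in ("model_mem", "tokenizer_config"):
    # e.g. /home/<user>/.xinference/cache/model_mem/qwen2.5-instruct-ggufv2-7b
    print(os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, category, cache_dir_name)))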

xinference/model/llm/vllm/core.py

+55 -4

@@ -51,7 +51,7 @@
     LoRA,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
-from ..llm_family import CustomLLMFamilyV1
+from ..llm_family import CustomLLMFamilyV1, cache_model_tokenizer_and_config
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
@@ -331,8 +331,10 @@ def load(self):
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
-        if vllm.__version__ >= "0.3.1":
-            # from vllm v0.3.1, it uses cupy as NCCL backend
+        from ..llm_family import LlamaCppLLMSpecV1
+
+        if "0.3.1" <= vllm.__version__ <= "0.3.3":
+            # from vllm v0.3.1 to v0.3.3, it uses cupy as NCCL backend
             # in which cupy will fork a process
             # only for xoscar >= 0.3.0, new process is allowed in subpool
             # besides, xinference set start method as forkserver for unix
@@ -345,6 +347,13 @@ def load(self):
 
         self.prepare_parse_reasoning_content(reasoning_content)
 
+        if (
+            isinstance(self.model_spec, LlamaCppLLMSpecV1)
+            and self.model_spec.model_format == "ggufv2"
+        ):
+            # gguf
+            self._preprocess_load_gguf()
+
         if self.lora_modules is None:
             self.lora_requests = []
         else:
@@ -483,6 +492,45 @@ def wait_for_load(self):
             _, err, tb = self._loading_error
             raise err.with_traceback(tb)
 
+    def _preprocess_load_gguf(self):
+        # check if it is multi gguf files
+        if (
+            not os.path.isfile(self.model_path)
+            and self.model_spec.quantization_parts
+            and self.quantization in self.model_spec.quantization_parts
+        ):
+            raise RuntimeError(
+                "vllm does not support multiple gguf files, please merge them first and "
+                "provide `model_path` with merged file"
+            )
+
+        if "tokenizer" not in self._model_config:
+            # find pytorch format without quantization
+            non_quant_spec = next(
+                spec
+                for spec in self.model_family.model_specs
+                if spec.model_format == "pytorch"
+                and "none" in spec.quantizations
+                and spec.model_size_in_billions
+                == self.model_spec.model_size_in_billions
+            )
+
+            path = cache_model_tokenizer_and_config(self.model_family, non_quant_spec)
+            # other than gguf file, vllm requires to provide tokenizer and hf_config_path
+            self._model_config["tokenizer"] = self._model_config[
+                "hf_config_path"
+            ] = path
+
+        if not os.path.isfile(self.model_path):
+            self.model_path = os.path.realpath(
+                os.path.join(
+                    self.model_path,
+                    self.model_spec.model_file_name_template.format(
+                        quantization=self.quantization
+                    ),
+                )
+            )
+
     def stop(self):
         # though the vLLM engine will shutdown when deleted,
         # but some issue e.g. GH#1682 reported
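_preprocess_load_gguf essentially prepares what vLLM's (still experimental) GGUF support expects: a single merged .gguf file as the model, plus a separate Hugging Face tokenizer/config to fall back on, since GGUF metadata alone is not always sufficient. A hedged sketch of the equivalent direct vLLM call, with hypothetical paths and model names; exact parameter support may vary across vLLM versions:

from vllm import LLM

# Hypothetical paths: a single merged GGUF file plus an HF tokenizer source.
llm = LLM(
    model="/models/qwen2.5-7b-instruct-q4_k_m.gguf",
    tokenizer="Qwen/Qwen2.5-7B-Instruct",  # tokenizer resolved separately from the GGUF
)
print(llm.generate(["Hello"])[0].outputs[0].text)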
@@ -943,7 +991,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "ggufv2"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
@@ -959,6 +1007,9 @@ def match_json(
         else:
             if "4" not in quantization:
                 return False
+        if llm_spec.model_format == "ggufv2":
+            if not (VLLM_INSTALLED and vllm.__version__ >= "0.8.2"):
+                return False
         if isinstance(llm_family, CustomLLMFamilyV1):
             if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS:
                 return False
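One caveat worth noting: the version gates in this commit compare version strings lexicographically (for example vllm.__version__ >= "0.8.2"), which works for the releases in question but will misclassify future two-digit minor releases such as 0.10.0. A possible hardening, shown only as a sketch and not as part of this commit, is to compare parsed versions via packaging:

from packaging.version import Version

def vllm_supports_gguf(vllm_version: str) -> bool:
    # Mirrors the gate above: GGUF on the vLLM engine requires vLLM >= 0.8.2.
    return Version(vllm_version) >= Version("0.8.2")

assert vllm_supports_gguf("0.8.2")
assert vllm_supports_gguf("0.10.0")   # a plain string comparison would wrongly reject this
assert not vllm_supports_gguf("0.7.3")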

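End to end, the commit means a ggufv2 model can be launched on the vLLM engine rather than only on llama.cpp. A hypothetical launch through the Python client, assuming the usual launch_model keyword arguments; the endpoint, model name, size and quantization below are placeholders, not values taken from this commit:

from xinference.client import Client

client = Client("http://localhost:9997")  # assumed local Xinference endpoint
model_uid = client.launch_model(
    model_name="qwen2.5-instruct",   # placeholder model name
    model_engine="vLLM",             # now accepted for ggufv2 after this commit
    model_format="ggufv2",
    model_size_in_billions=7,        # placeholder size
    quantization="q4_k_m",           # placeholder quantization
)
print(model_uid)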