Skip to content

Commit a37da2e

Browse files
Jun-Howie and qinxuye authored
ENH: Support GLM4-0414 MLX and GGUF (#3325)
Co-authored-by: qinxuye <[email protected]>
1 parent 452dc9b commit a37da2e

File tree

4 files changed

+192
-4
lines changed

4 files changed

+192
-4
lines changed

xinference/model/llm/llm_family.json

+93-1
Original file line numberDiff line numberDiff line change
@@ -11075,6 +11075,91 @@
1107511075
"none"
1107611076
],
1107711077
"model_id": "THUDM/GLM-4-32B-0414"
11078+
},
11079+
{
11080+
"model_format": "mlx",
11081+
"model_size_in_billions": 9,
11082+
"quantizations": [
11083+
"4bit",
11084+
"6bit",
11085+
"8bit",
11086+
"bf16"
11087+
],
11088+
"model_id": "mlx-community/GLM-4-9B-0414-{quantization}"
11089+
},
11090+
{
11091+
"model_format": "mlx",
11092+
"model_size_in_billions": 32,
11093+
"quantizations": [
11094+
"4bit",
11095+
"8bit"
11096+
],
11097+
"model_id": "mlx-community/GLM-4-32B-0414-{quantization}"
11098+
},
11099+
{
11100+
"model_format": "ggufv2",
11101+
"model_size_in_billions": 9,
11102+
"quantizations": [
11103+
"IQ2_M",
11104+
"IQ3_M",
11105+
"IQ3_XS",
11106+
"IQ3_XXS",
11107+
"IQ4_NL",
11108+
"IQ4_XS",
11109+
"Q2_K",
11110+
"Q2_K_L",
11111+
"Q3_K_L",
11112+
"Q3_K_M",
11113+
"Q3_K_S",
11114+
"Q3_K_XL",
11115+
"Q4_0",
11116+
"Q4_1",
11117+
"Q4_K_L",
11118+
"Q4_K_M",
11119+
"Q4_K_S",
11120+
"Q5_K_L",
11121+
"Q5_K_M",
11122+
"Q5_K_S",
11123+
"Q6_K",
11124+
"Q6_K_L",
11125+
"Q8_0",
11126+
"bf16"
11127+
],
11128+
"model_id": "bartowski/THUDM_GLM-4-9B-0414-GGUF",
11129+
"model_file_name_template": "THUDM_GLM-4-9B-0414-{quantization}.gguf"
11130+
},
11131+
{
11132+
"model_format": "ggufv2",
11133+
"model_size_in_billions": 32,
11134+
"quantizations": [
11135+
"IQ2_M",
11136+
"IQ2_S",
11137+
"IQ2_XS",
11138+
"IQ3_M",
11139+
"IQ3_XS",
11140+
"IQ3_XXS",
11141+
"IQ4_NL",
11142+
"IQ4_XS",
11143+
"Q2_K",
11144+
"Q2_K_L",
11145+
"Q3_K_L",
11146+
"Q3_K_M",
11147+
"Q3_K_S",
11148+
"Q3_K_XL",
11149+
"Q4_0",
11150+
"Q4_1",
11151+
"Q4_K_L",
11152+
"Q4_K_M",
11153+
"Q4_K_S",
11154+
"Q5_K_L",
11155+
"Q5_K_M",
11156+
"Q5_K_S",
11157+
"Q6_K",
11158+
"Q6_K_L",
11159+
"Q8_0"
11160+
],
11161+
"model_id": "bartowski/THUDM_GLM-4-32B-0414-GGUF",
11162+
"model_file_name_template": "THUDM_GLM-4-32B-0414-{quantization}.gguf"
1107811163
}
1107911164
],
1108011165
"chat_template": "[gMASK]<sop>{%- if tools -%}<|system|>\n# 可用工具\n{% for tool in tools %}{%- set function = tool.function if tool.get(\"function\") else tool %}\n\n## {{ function.name }}\n\n{{ function | tojson(indent=4, ensure_ascii=False) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。{%- endfor %}{%- endif -%}{%- for msg in messages %}{%- if msg.role == 'system' %}<|system|>\n{{ msg.content }}{%- endif %}{%- endfor %}{%- for message in messages if message.role != 'system' %}{%- set role = message['role'] %}{%- set content = message['content'] %}{%- set meta = message.get(\"metadata\", \"\") %}{%- if role == 'user' %}<|user|>\n{{ content }}{%- elif role == 'assistant' and not meta %}<|assistant|>\n{{ content }}{%- elif role == 'assistant' and meta %}<|assistant|>{{ meta }} \n{{ content }}{%- elif role == 'observation' %}<|observation|>\n{{ content }}{%- endif %}{%- endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
@@ -11087,7 +11172,14 @@
1108711172
"<|endoftext|>",
1108811173
"<|user|>",
1108911174
"<|observation|>"
11090-
]
11175+
],
11176+
"virtualenv": {
11177+
"packages": [
11178+
"transformers>=4.51.3",
11179+
"mlx-lm>=0.23.1 ; sys_platform=='darwin'",
11180+
"numpy==1.26.4"
11181+
]
11182+
}
1109111183
},
1109211184
{
1109311185
"version": 1,

xinference/model/llm/llm_family_modelscope.json

+97-1
Original file line numberDiff line numberDiff line change
@@ -8858,6 +8858,95 @@
88588858
],
88598859
"model_id": "ZhipuAI/GLM-4-32B-0414",
88608860
"model_hub": "modelscope"
8861+
},
8862+
{
8863+
"model_format": "mlx",
8864+
"model_size_in_billions": 9,
8865+
"quantizations": [
8866+
"4bit",
8867+
"6bit",
8868+
"8bit",
8869+
"bf16"
8870+
],
8871+
"model_id": "mlx-community/GLM-4-9B-0414-{quantization}",
8872+
"model_hub": "modelscope"
8873+
},
8874+
{
8875+
"model_format": "mlx",
8876+
"model_size_in_billions": 32,
8877+
"quantizations": [
8878+
"4bit",
8879+
"8bit"
8880+
],
8881+
"model_id": "mlx-community/GLM-4-32B-0414-{quantization}",
8882+
"model_hub": "modelscope"
8883+
},
8884+
{
8885+
"model_format": "ggufv2",
8886+
"model_size_in_billions": 9,
8887+
"quantizations": [
8888+
"IQ2_M",
8889+
"IQ3_M",
8890+
"IQ3_XS",
8891+
"IQ3_XXS",
8892+
"IQ4_NL",
8893+
"IQ4_XS",
8894+
"Q2_K",
8895+
"Q2_K_L",
8896+
"Q3_K_L",
8897+
"Q3_K_M",
8898+
"Q3_K_S",
8899+
"Q3_K_XL",
8900+
"Q4_0",
8901+
"Q4_1",
8902+
"Q4_K_L",
8903+
"Q4_K_M",
8904+
"Q4_K_S",
8905+
"Q5_K_L",
8906+
"Q5_K_M",
8907+
"Q5_K_S",
8908+
"Q6_K",
8909+
"Q6_K_L",
8910+
"Q8_0",
8911+
"bf16"
8912+
],
8913+
"model_id": "bartowski/THUDM_GLM-4-9B-0414-GGUF",
8914+
"model_file_name_template": "THUDM_GLM-4-9B-0414-{quantization}.gguf",
8915+
"model_hub": "modelscope"
8916+
},
8917+
{
8918+
"model_format": "ggufv2",
8919+
"model_size_in_billions": 32,
8920+
"quantizations": [
8921+
"IQ2_M",
8922+
"IQ2_S",
8923+
"IQ2_XS",
8924+
"IQ3_M",
8925+
"IQ3_XS",
8926+
"IQ3_XXS",
8927+
"IQ4_NL",
8928+
"IQ4_XS",
8929+
"Q2_K",
8930+
"Q2_K_L",
8931+
"Q3_K_L",
8932+
"Q3_K_M",
8933+
"Q3_K_S",
8934+
"Q3_K_XL",
8935+
"Q4_0",
8936+
"Q4_1",
8937+
"Q4_K_L",
8938+
"Q4_K_M",
8939+
"Q4_K_S",
8940+
"Q5_K_L",
8941+
"Q5_K_M",
8942+
"Q5_K_S",
8943+
"Q6_K",
8944+
"Q6_K_L",
8945+
"Q8_0"
8946+
],
8947+
"model_id": "bartowski/THUDM_GLM-4-32B-0414-GGUF",
8948+
"model_file_name_template": "THUDM_GLM-4-32B-0414-{quantization}.gguf",
8949+
"model_hub": "modelscope"
88618950
}
88628951
],
88638952
"chat_template": "[gMASK]<sop>{%- if tools -%}<|system|>\n# 可用工具\n{% for tool in tools %}{%- set function = tool.function if tool.get(\"function\") else tool %}\n\n## {{ function.name }}\n\n{{ function | tojson(indent=4, ensure_ascii=False) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。{%- endfor %}{%- endif -%}{%- for msg in messages %}{%- if msg.role == 'system' %}<|system|>\n{{ msg.content }}{%- endif %}{%- endfor %}{%- for message in messages if message.role != 'system' %}{%- set role = message['role'] %}{%- set content = message['content'] %}{%- set meta = message.get(\"metadata\", \"\") %}{%- if role == 'user' %}<|user|>\n{{ content }}{%- elif role == 'assistant' and not meta %}<|assistant|>\n{{ content }}{%- elif role == 'assistant' and meta %}<|assistant|>{{ meta }} \n{{ content }}{%- elif role == 'observation' %}<|observation|>\n{{ content }}{%- endif %}{%- endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
@@ -8870,7 +8959,14 @@
88708959
"<|endoftext|>",
88718960
"<|user|>",
88728961
"<|observation|>"
8873-
]
8962+
],
8963+
"virtualenv": {
8964+
"packages": [
8965+
"transformers>=4.51.3",
8966+
"mlx-lm>=0.23.1 ; sys_platform=='darwin'",
8967+
"numpy==1.26.4"
8968+
]
8969+
}
88748970
},
88758971
{
88768972
"version": 1,

xinference/model/llm/transformers/deepseek_vl.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def match_json(
4646
cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
4747
) -> bool:
4848
llm_family = model_family.model_family or model_family.model_name
49-
if "deepseek-vl" == llm_family.lower():
49+
if "deepseek-vl-chat" == llm_family.lower():
5050
return True
5151
return False
5252

xinference/web/ui/src/scenes/launch_model/data/data.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ export const featureModels = [
6868
'deepseek-r1-distill-llama',
6969
'qwen2.5-instruct',
7070
'qwen2.5-vl-instruct',
71-
'qwen2.5-coder-instruct',
71+
'glm4-0414',
7272
'QwQ-32B',
7373
'llama-3.1-instruct',
7474
'gemma-3-it',

0 commit comments

Comments
 (0)