Commit 2b796a9

FEAT: ovis2 (#3170)

1 parent aa4d536
5 files changed: +544 −0 lines

5 files changed

+544
-0
lines changed

xinference/model/llm/__init__.py  (+2)

@@ -151,6 +151,7 @@ def _install():
     from .transformers.minicpmv25 import MiniCPMV25Model
     from .transformers.minicpmv26 import MiniCPMV26Model
     from .transformers.opt import OptPytorchModel
+    from .transformers.ovis2 import Ovis2ChatModel
     from .transformers.qwen2_audio import Qwen2AudioChatModel
     from .transformers.qwen_vl import QwenVLChatModel
     from .transformers.yi_vl import YiVLChatModel
@@ -199,6 +200,7 @@ def _install():
             CogAgentChatModel,
             Gemma3TextChatModel,
             Gemma3ChatModel,
+            Ovis2ChatModel,
         ]
     )
     if OmniLMMModel:  # type: ignore
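The second hunk appends Ovis2ChatModel to the class list that _install() registers with the transformers backend; dispatch then works by asking each registered class whether it matches a requested model family. Below is a minimal, self-contained sketch of that register-then-match pattern; the registry name and match() signature are illustrative simplifications, not xinference's actual internals.

# Illustrative sketch of the register-then-match dispatch used by
# _install(); names and signatures here are hypothetical simplifications.
MODEL_CLASSES: list = []

class Ovis2ChatModel:
    @classmethod
    def match(cls, model_family: str) -> bool:
        # The real class inspects the full family spec; "Ovis2" is the
        # model_name registered in llm_family.json below.
        return model_family == "Ovis2"

def _install() -> None:
    # Appending the class is all the registration this pattern needs.
    MODEL_CLASSES.extend([Ovis2ChatModel])

def resolve_model_class(model_family: str) -> type:
    # The first registered class whose match() accepts the family wins.
    for cls in MODEL_CLASSES:
        if cls.match(model_family):
            return cls
    raise ValueError(f"no backend class for {model_family!r}")

_install()
assert resolve_model_class("Ovis2") is Ovis2ChatModel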

xinference/model/llm/llm_family.json  (+114)

@@ -11181,6 +11181,120 @@
             ]
         }
     },
+    {
+        "version":1,
+        "context_length":32768,
+        "model_name":"Ovis2",
+        "model_lang":[
+            "en",
+            "zh"
+        ],
+        "model_ability":[
+            "chat",
+            "vision"
+        ],
+        "model_description":"Ovis (Open VISion) is a novel Multimodal Large Language Model (MLLM) architecture, designed to structurally align visual and textual embeddings.",
+        "model_specs":[
+            {
+                "model_format":"pytorch",
+                "model_size_in_billions":1,
+                "quantizations":[
+                    "none"
+                ],
+                "model_id":"AIDC-AI/Ovis2-1B"
+            },
+            {
+                "model_format":"pytorch",
+                "model_size_in_billions":2,
+                "quantizations":[
+                    "none"
+                ],
+                "model_id":"AIDC-AI/Ovis2-2B"
+            },
+            {
+                "model_format":"pytorch",
+                "model_size_in_billions":4,
+                "quantizations":[
+                    "none"
+                ],
+                "model_id":"AIDC-AI/Ovis2-4B"
+            },
+            {
+                "model_format":"pytorch",
+                "model_size_in_billions":8,
+                "quantizations":[
+                    "none"
+                ],
+                "model_id":"AIDC-AI/Ovis2-8B"
+            },
+            {
+                "model_format":"pytorch",
+                "model_size_in_billions":16,
+                "quantizations":[
+                    "none"
+                ],
+                "model_id":"AIDC-AI/Ovis2-16B"
+            },
+            {
+                "model_format":"pytorch",
+                "model_size_in_billions":34,
+                "quantizations":[
+                    "none"
+                ],
+                "model_id":"AIDC-AI/Ovis2-34B"
+            },
+            {
+                "model_format":"gptq",
+                "model_size_in_billions":2,
+                "quantizations":[
+                    "Int4"
+                ],
+                "model_id":"AIDC-AI/Ovis2-2B-GPTQ-{quantization}"
+            },
+            {
+                "model_format":"gptq",
+                "model_size_in_billions":4,
+                "quantizations":[
+                    "Int4"
+                ],
+                "model_id":"AIDC-AI/Ovis2-4B-GPTQ-{quantization}"
+            },
+            {
+                "model_format":"gptq",
+                "model_size_in_billions":8,
+                "quantizations":[
+                    "Int4"
+                ],
+                "model_id":"AIDC-AI/Ovis2-8B-GPTQ-{quantization}"
+            },
+            {
+                "model_format":"gptq",
+                "model_size_in_billions":16,
+                "quantizations":[
+                    "Int4"
+                ],
+                "model_id":"AIDC-AI/Ovis2-16B-GPTQ-{quantization}"
+            },
+            {
+                "model_format":"gptq",
+                "model_size_in_billions":34,
+                "quantizations":[
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id":"AIDC-AI/Ovis2-34B-GPTQ-{quantization}"
+            }
+        ],
+        "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+        "stop_token_ids": [
+            151645,
+            151643
+        ],
+        "stop": [
+            "<|im_end|>",
+            "<|endoftext|>"
+        ]
+    },
     {
         "version": 1,
         "context_length": 32768,

xinference/model/llm/llm_family_modelscope.json  (+125)

@@ -8968,6 +8968,131 @@
             ]
         }
     },
+    {
+        "version":1,
+        "context_length":32768,
+        "model_name":"Ovis2",
+        "model_lang":[
+            "en",
+            "zh"
+        ],
+        "model_ability":[
+            "chat",
+            "vision"
+        ],
+        "model_description":"Ovis (Open VISion) is a novel Multimodal Large Language Model (MLLM) architecture, designed to structurally align visual and textual embeddings.",
+        "model_specs":[
+            {
+                "model_format":"pytorch",
+                "model_size_in_billions":1,
+                "quantizations":[
+                    "none"
+                ],
+                "model_id":"AIDC-AI/Ovis2-1B",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format":"pytorch",
+                "model_size_in_billions":2,
+                "quantizations":[
+                    "none"
+                ],
+                "model_id":"AIDC-AI/Ovis2-2B",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format":"pytorch",
+                "model_size_in_billions":4,
+                "quantizations":[
+                    "none"
+                ],
+                "model_id":"AIDC-AI/Ovis2-4B",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format":"pytorch",
+                "model_size_in_billions":8,
+                "quantizations":[
+                    "none"
+                ],
+                "model_id":"AIDC-AI/Ovis2-8B",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format":"pytorch",
+                "model_size_in_billions":16,
+                "quantizations":[
+                    "none"
+                ],
+                "model_id":"AIDC-AI/Ovis2-16B",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format":"pytorch",
+                "model_size_in_billions":34,
+                "quantizations":[
+                    "none"
+                ],
+                "model_id":"AIDC-AI/Ovis2-34B",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format":"gptq",
+                "model_size_in_billions":2,
+                "quantizations":[
+                    "Int4"
+                ],
+                "model_id":"AIDC-AI/Ovis2-2B-GPTQ-{quantization}",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format":"gptq",
+                "model_size_in_billions":4,
+                "quantizations":[
+                    "Int4"
+                ],
+                "model_id":"AIDC-AI/Ovis2-4B-GPTQ-{quantization}",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format":"gptq",
+                "model_size_in_billions":8,
+                "quantizations":[
+                    "Int4"
+                ],
+                "model_id":"AIDC-AI/Ovis2-8B-GPTQ-{quantization}",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format":"gptq",
+                "model_size_in_billions":16,
+                "quantizations":[
+                    "Int4"
+                ],
+                "model_id":"AIDC-AI/Ovis2-16B-GPTQ-{quantization}",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format":"gptq",
+                "model_size_in_billions":34,
+                "quantizations":[
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id":"AIDC-AI/Ovis2-34B-GPTQ-{quantization}",
+                "model_hub": "modelscope"
+            }
+        ],
+        "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+        "stop_token_ids": [
+            151645,
+            151643
+        ],
+        "stop": [
+            "<|im_end|>",
+            "<|endoftext|>"
+        ]
+    },
     {
         "version": 1,
         "context_length": 32768,

xinference/model/llm/transformers/core.py  (+1)

@@ -75,6 +75,7 @@
     "cogagent",
     "gemma-3-1b-it",
     "gemma-3-it",
+    "Ovis2",
     "deepseek-vl2",
 ]
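This list appears to be the registry of families excluded from the default transformers code path, so requests for "Ovis2" route to the dedicated Ovis2ChatModel registered in __init__.py rather than the generic implementation.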
