FEAT: support qwen2.5-omni #3279

Merged · 6 commits · Apr 18, 2025
2 changes: 2 additions & 0 deletions setup.cfg
@@ -147,6 +147,7 @@ all =
jieba # For F5-TTS
soundfile # For F5-TTS
qwen-vl-utils!=0.0.9 # For qwen2-vl
qwen_omni_utils # For qwen2.5-omni
datamodel_code_generator # for minicpm-4B
jsonschema # for minicpm-4B
verovio>=4.3.1 # For got_ocr2
@@ -180,6 +181,7 @@ transformers =
eva-decord # For video in VL
jj-pytorchvideo # For CogVLM2-video
qwen-vl-utils!=0.0.9 # For qwen2-vl
qwen_omni_utils # For qwen2.5-omni
datamodel_code_generator # for minicpm-4B
jsonschema # for minicpm-4B
blobfile #for moonlight-16b-a3b
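The new `qwen_omni_utils` dependency plays the same role for qwen2.5-omni that `qwen-vl-utils` plays for qwen2-vl: it preprocesses multimodal chat messages before they reach the processor. A minimal sketch of the expected usage (hedged: `process_mm_info` and its signature come from the upstream Qwen2.5-Omni examples, not from this diff; the file path is a placeholder):

```python
from qwen_omni_utils import process_mm_info

# OpenAI-style conversation with one audio part; "sample.wav" is a placeholder.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": "sample.wav"},
            {"type": "text", "text": "What is said in this clip?"},
        ],
    }
]

# Splits the conversation into audio/image/video inputs for the model processor.
audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
```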
120 changes: 104 additions & 16 deletions xinference/core/chat_interface.py
@@ -16,6 +16,7 @@
import html
import logging
import os
import tempfile
from io import BytesIO
from typing import Generator, List, Optional

@@ -66,7 +67,7 @@ def __init__(

    def build(self) -> "gr.Blocks":
        if "vision" in self.model_ability:
            interface = self.build_chat_vl_interface()
            interface = self.build_chat_multimodel_interface()
        elif "chat" in self.model_ability:
            interface = self.build_chat_interface()
        else:
@@ -330,7 +331,7 @@ def generate_wrapper(

        return chat_interface

    def build_chat_vl_interface(
    def build_chat_multimodel_interface(
        self,
    ) -> "gr.Blocks":
        def predict(history, bot, max_tokens, temperature, stream):
@@ -377,11 +378,46 @@ def predict(history, bot, max_tokens, temperature, stream):
                    },
                )
                history.append(response["choices"][0]["message"])
                bot[-1][1] = history[-1]["content"]
                yield history, bot
                if "audio" in history[-1]:
                    # audio output
                    audio_bytes = base64.b64decode(history[-1]["audio"]["data"])
                    audio_file = tempfile.NamedTemporaryFile(
                        delete=False, suffix=".wav"
                    )
                    audio_file.write(audio_bytes)
                    audio_file.close()

                    def audio_to_base64(audio_path):
                        with open(audio_path, "rb") as audio_file:
                            return base64.b64encode(audio_file.read()).decode("utf-8")

                    def generate_html_audio(audio_path):
                        base64_audio = audio_to_base64(audio_path)
                        audio_format = audio_path.split(".")[-1]
                        return (
                            f"<audio controls style='max-width:100%;'>"
                            f"<source src='data:audio/{audio_format};base64,{base64_audio}' type='audio/{audio_format}'>"
                            f"Your browser does not support the audio tag.</audio>"
                        )

                    bot[-1] = (bot[-1][0], history[-1]["content"])
                    yield history, bot

                    # append html audio tag instead of gr.Audio
                    bot.append((None, generate_html_audio(audio_file.name)))
                    yield history, bot
                else:
                    bot[-1][1] = history[-1]["content"]
                    yield history, bot

        def add_text(history, bot, text, image, video):
            logger.debug("Add text, text: %s, image: %s, video: %s", text, image, video)
        def add_text(history, bot, text, image, video, audio):
            logger.debug(
                "Add text, text: %s, image: %s, video: %s, audio: %s",
                text,
                image,
                video,
                audio,
            )
            if image:
                buffered = BytesIO()
                with PIL.Image.open(image) as img:
@@ -432,20 +468,54 @@ def generate_html_video(video_path):
                        },
                    ],
                }

            elif audio:

                def audio_to_base64(audio_path):
                    with open(audio_path, "rb") as audio_file:
                        encoded_string = base64.b64encode(audio_file.read()).decode(
                            "utf-8"
                        )
                        return encoded_string

                def generate_html_audio(audio_path):
                    base64_audio = audio_to_base64(audio_path)
                    audio_format = audio_path.split(".")[-1]
                    return (
                        f"<audio controls style='max-width:100%;'>"
                        f"<source src='data:audio/{audio_format};base64,{base64_audio}' type='audio/{audio_format}'>"
                        f"Your browser does not support the audio tag.</audio>"
                    )

                display_content = f"{generate_html_audio(audio)}<br>{text}"
                message = {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text},
                        {
                            "type": "audio_url",
                            "audio_url": {"url": audio},
                        },
                    ],
                }

            else:
                display_content = text
                message = {"role": "user", "content": text}
            history = history + [message]
            bot = bot + [[display_content, None]]
            return history, bot, "", None, None
            return history, bot, "", None, None, None

        def clear_history():
            logger.debug("Clear history.")
            return [], None, "", None, None
            return [], None, "", None, None, None

        def update_button(text):
            return gr.update(interactive=bool(text))
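The `elif audio:` branch above mirrors the image and video paths: the chatbot pane gets an inline HTML `<audio>` player, while the message recorded in `history` uses an `audio_url` content part. A standalone sketch of the resulting user message (shape copied from `add_text`; the inputs are placeholders):

```python
import json

# Shape of the message add_text() appends for an audio upload.
audio, text = "/tmp/recording.wav", "Transcribe this."  # placeholder inputs
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": text},
        {"type": "audio_url", "audio_url": {"url": audio}},
    ],
}
print(json.dumps(message, indent=2))
```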

        has_vision = "vision" in self.model_ability
        has_audio = "audio" in self.model_ability

        with gr.Blocks(
            title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
            css="""
@@ -484,11 +554,29 @@ def update_button(text):
            state = gr.State([])
            with gr.Row():
                chatbot = gr.Chatbot(
                    elem_id="chatbot", label=self.model_name, height=700, scale=7
                    elem_id="chatbot", label=self.model_name, scale=7, min_height=900
                )
                with gr.Column(scale=3):
                    imagebox = gr.Image(type="filepath")
                    videobox = gr.Video()
                    if has_vision:
                        imagebox = gr.Image(type="filepath")
                        videobox = gr.Video()
                    else:
                        imagebox = gr.Image(type="filepath", visible=False)
                        videobox = gr.Video(visible=False)

                    if has_audio:
                        audiobox = gr.Audio(
                            sources=["microphone", "upload"],
                            type="filepath",
                            visible=True,
                        )
                    else:
                        audiobox = gr.Audio(
                            sources=["microphone", "upload"],
                            type="filepath",
                            visible=False,
                        )

                    textbox = gr.Textbox(
                        show_label=False,
                        placeholder="Enter text and press ENTER",
@@ -516,8 +604,8 @@ def update_button(text):

            textbox.submit(
                add_text,
                [state, chatbot, textbox, imagebox, videobox],
                [state, chatbot, textbox, imagebox, videobox],
                [state, chatbot, textbox, imagebox, videobox, audiobox],
                [state, chatbot, textbox, imagebox, videobox, audiobox],
                queue=False,
            ).then(
                predict,
@@ -527,8 +615,8 @@ def update_button(text):

            submit_btn.click(
                add_text,
                [state, chatbot, textbox, imagebox, videobox],
                [state, chatbot, textbox, imagebox, videobox],
                [state, chatbot, textbox, imagebox, videobox, audiobox],
                [state, chatbot, textbox, imagebox, videobox, audiobox],
                queue=False,
            ).then(
                predict,
@@ -539,7 +627,7 @@ def update_button(text):
            clear_btn.click(
                clear_history,
                None,
                [state, chatbot, textbox, imagebox, videobox],
                [state, chatbot, textbox, imagebox, videobox, audiobox],
                queue=False,
            )

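On the response side, `predict` assumes that when the model speaks its answer, the assistant message carries base64-encoded WAV bytes under `message["audio"]["data"]`. A self-contained sketch of that decode step (the response dict is a stand-in with dummy bytes, shaped like the one handled above):

```python
import base64

# Stand-in for a chat-completion response; "data" would be real WAV bytes.
response = {
    "choices": [
        {
            "message": {
                "role": "assistant",
                "content": "Here is the spoken answer.",
                "audio": {"data": base64.b64encode(b"RIFF\x00\x00WAVE").decode()},
            }
        }
    ]
}

message = response["choices"][0]["message"]
if "audio" in message:
    # Decode the base64 payload to a .wav file, as predict() does via tempfile.
    with open("reply.wav", "wb") as f:
        f.write(base64.b64decode(message["audio"]["data"]))
```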
1 change: 1 addition & 0 deletions xinference/deploy/docker/requirements.txt
@@ -99,6 +99,7 @@ misaki[en,ja,zh]>=0.7.15 # Kokoro
en_core_web_trf@https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl # Kokoro misaki[en]
en_core_web_sm@https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl # Kokoro misaki[en]
qwen-vl-utils!=0.0.9 # For qwen2-vl
qwen_omni_utils # For qwen2.5-omni
datamodel_code_generator # for minicpm-4B
jsonschema # for minicpm-4B
deepcache # for sd
1 change: 1 addition & 0 deletions xinference/deploy/docker/requirements_cpu.txt
@@ -92,6 +92,7 @@ misaki[en,ja,zh]>=0.7.15 # Kokoro
en_core_web_trf@https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl # Kokoro misaki[en]
en_core_web_sm@https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl # Kokoro misaki[en]
qwen-vl-utils!=0.0.9 # For qwen2-vl
qwen_omni_utils # For qwen2.5-omni
datamodel_code_generator # for minicpm-4B
jsonschema # for minicpm-4B
verovio>=4.3.1 # For got_ocr2
41 changes: 41 additions & 0 deletions xinference/model/llm/llm_family.json
@@ -7909,6 +7909,47 @@
"<|endoftext|>"
    ]
  },
  {
    "version":1,
    "context_length":32768,
    "model_name":"qwen2.5-omni",
    "model_lang":[
      "en",
      "zh"
    ],
    "model_ability":[
      "chat",
      "vision",
      "audio",
      "omni"
    ],
    "model_description":"Qwen2.5-Omni: the new flagship end-to-end multimodal model in the Qwen series.",
    "model_specs":[
      {
        "model_format":"pytorch",
        "model_size_in_billions":7,
        "quantizations":[
          "none"
        ],
        "model_id":"Qwen/Qwen2.5-Omni-7B"
      }
    ],
    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
    "stop_token_ids": [
      151645,
      151643
    ],
    "stop": [
      "<|im_end|>",
      "<|endoftext|>"
    ],
    "virtualenv": {
      "packages": [
        "git+https://github.com/huggingface/[email protected]",
        "numpy==1.26.4"
      ]
    }
  },
  {
    "version": 1,
    "context_length": 32768,
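With this entry registered, the model launches like any other chat family. A hedged end-to-end sketch using the xinference Python client (endpoint, engine name, and the exact `chat` signature are assumptions about a local deployment, not part of this diff):

```python
from xinference.client import Client

client = Client("http://localhost:9997")  # assumed supervisor endpoint
model_uid = client.launch_model(
    model_name="qwen2.5-omni",
    model_engine="transformers",  # assumed engine for the pytorch spec above
    model_format="pytorch",
)
model = client.get_model(model_uid)

# Content parts follow the shapes used by the chat UI above.
response = model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this recording."},
                {"type": "audio_url", "audio_url": {"url": "sample.wav"}},
            ],
        }
    ],
    generate_config={"max_tokens": 512},
)
print(response["choices"][0]["message"]["content"])
```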
4 changes: 3 additions & 1 deletion xinference/model/llm/llm_family.py
@@ -135,7 +135,9 @@ class LLMFamilyV1(BaseModel):
    model_name: str
    model_lang: List[str]
    model_ability: List[
        Literal["embed", "generate", "chat", "tools", "vision", "audio", "reasoning"]
        Literal[
            "embed", "generate", "chat", "tools", "vision", "audio", "omni", "reasoning"
        ]
    ]
    model_description: Optional[str]
    # reason for not required str here: legacy registration
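The schema change is just a widening of the `model_ability` literal with `"omni"`. A trimmed, runnable sketch of what the validator now accepts (field list reduced for illustration; the real `LLMFamilyV1` has many more fields):

```python
from typing import List, Literal

from pydantic import BaseModel, ConfigDict

Ability = Literal[
    "embed", "generate", "chat", "tools", "vision", "audio", "omni", "reasoning"
]

class FamilySketch(BaseModel):
    # Trimmed stand-in for LLMFamilyV1; only the widened field is shown.
    # Allow field names starting with "model_" under pydantic v2.
    model_config = ConfigDict(protected_namespaces=())

    model_name: str
    model_ability: List[Ability]

family = FamilySketch(
    model_name="qwen2.5-omni",
    model_ability=["chat", "vision", "audio", "omni"],
)
print(family.model_ability)  # ['chat', 'vision', 'audio', 'omni']
```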
42 changes: 42 additions & 0 deletions xinference/model/llm/llm_family_modelscope.json
@@ -5666,6 +5666,48 @@
"<|endoftext|>"
    ]
  },
  {
    "version":1,
    "context_length":32768,
    "model_name":"qwen2.5-omni",
    "model_lang":[
      "en",
      "zh"
    ],
    "model_ability":[
      "chat",
      "vision",
      "audio",
      "omni"
    ],
    "model_description":"Qwen2.5-Omni: the new flagship end-to-end multimodal model in the Qwen series.",
    "model_specs":[
      {
        "model_format":"pytorch",
        "model_size_in_billions":7,
        "quantizations":[
          "none"
        ],
        "model_hub": "modelscope",
        "model_id":"Qwen/Qwen2.5-Omni-7B"
      }
    ],
    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
    "stop_token_ids": [
      151645,
      151643
    ],
    "stop": [
      "<|im_end|>",
      "<|endoftext|>"
    ],
    "virtualenv": {
      "packages": [
        "git+https://github.com/huggingface/[email protected]",
        "numpy==1.26.4"
      ]
    }
  },
  {
    "version": 1,
    "context_length": 32768,
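Both registrations pin the same `virtualenv` packages: a preview `transformers` branch (qwen2.5-omni support had not yet shipped in a stable release when this was merged) plus `numpy==1.26.4`. For experimenting outside the managed virtualenv, an equivalent manual install would be (same pins as above; hypothetical workflow):

```python
import subprocess
import sys

# Install the same pins as the virtualenv block above into the current env.
subprocess.check_call([
    sys.executable, "-m", "pip", "install",
    "git+https://github.com/huggingface/[email protected]",
    "numpy==1.26.4",
])
```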