FEAT: support qwen2.5-omni #3279

Merged · 6 commits · Apr 18, 2025
2 changes: 2 additions & 0 deletions setup.cfg
@@ -147,6 +147,7 @@ all =
jieba # For F5-TTS
soundfile # For F5-TTS
qwen-vl-utils!=0.0.9 # For qwen2-vl
qwen_omni_utils # For qwen2.5-omni
datamodel_code_generator # for minicpm-4B
jsonschema # for minicpm-4B
verovio>=4.3.1 # For got_ocr2
@@ -180,6 +181,7 @@ transformers =
eva-decord # For video in VL
jj-pytorchvideo # For CogVLM2-video
qwen-vl-utils!=0.0.9 # For qwen2-vl
qwen_omni_utils # For qwen2.5-omni
datamodel_code_generator # for minicpm-4B
jsonschema # for minicpm-4B
blobfile #for moonlight-16b-a3b
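The new `qwen_omni_utils` dependency plays the same role for qwen2.5-omni that `qwen-vl-utils` plays for qwen2-vl: it preprocesses multimodal chat messages before they reach the processor. A minimal sketch of the expected usage (hedged: `process_mm_info` and its signature come from the upstream Qwen2.5-Omni examples, not from this diff; the file path is a placeholder):

```python
from qwen_omni_utils import process_mm_info

# OpenAI-style conversation with one audio part; "sample.wav" is a placeholder.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": "sample.wav"},
            {"type": "text", "text": "What is said in this clip?"},
        ],
    }
]

# Splits the conversation into audio/image/video inputs for the model processor.
audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
```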
120 changes: 104 additions & 16 deletions xinference/core/chat_interface.py
@@ -16,6 +16,7 @@
import html
import logging
import os
import tempfile
from io import BytesIO
from typing import Generator, List, Optional

@@ -66,7 +67,7 @@ def __init__(

    def build(self) -> "gr.Blocks":
        if "vision" in self.model_ability:
            interface = self.build_chat_vl_interface()
            interface = self.build_chat_multimodel_interface()
        elif "chat" in self.model_ability:
            interface = self.build_chat_interface()
        else:
@@ -330,7 +331,7 @@ def generate_wrapper(

        return chat_interface

    def build_chat_vl_interface(
    def build_chat_multimodel_interface(
        self,
    ) -> "gr.Blocks":
        def predict(history, bot, max_tokens, temperature, stream):
@@ -377,11 +378,46 @@ def predict(history, bot, max_tokens, temperature, stream):
                    },
                )
                history.append(response["choices"][0]["message"])
                bot[-1][1] = history[-1]["content"]
                yield history, bot
                if "audio" in history[-1]:
                    # audio output
                    audio_bytes = base64.b64decode(history[-1]["audio"]["data"])
                    audio_file = tempfile.NamedTemporaryFile(
                        delete=False, suffix=".wav"
                    )
                    audio_file.write(audio_bytes)
                    audio_file.close()

                    def audio_to_base64(audio_path):
                        with open(audio_path, "rb") as audio_file:
                            return base64.b64encode(audio_file.read()).decode("utf-8")

                    def generate_html_audio(audio_path):
                        base64_audio = audio_to_base64(audio_path)
                        audio_format = audio_path.split(".")[-1]
                        return (
                            f"<audio controls style='max-width:100%;'>"
                            f"<source src='data:audio/{audio_format};base64,{base64_audio}' type='audio/{audio_format}'>"
                            f"Your browser does not support the audio tag.</audio>"
                        )

                    bot[-1] = (bot[-1][0], history[-1]["content"])
                    yield history, bot

                    # append html audio tag instead of gr.Audio
                    bot.append((None, generate_html_audio(audio_file.name)))
                    yield history, bot
                else:
                    bot[-1][1] = history[-1]["content"]
                    yield history, bot

        def add_text(history, bot, text, image, video):
            logger.debug("Add text, text: %s, image: %s, video: %s", text, image, video)
        def add_text(history, bot, text, image, video, audio):
            logger.debug(
                "Add text, text: %s, image: %s, video: %s, audio: %s",
                text,
                image,
                video,
                audio,
            )
            if image:
                buffered = BytesIO()
                with PIL.Image.open(image) as img:
@@ -432,20 +468,54 @@ def generate_html_video(video_path):
                        },
                    ],
                }

            elif audio:

                def audio_to_base64(audio_path):
                    with open(audio_path, "rb") as audio_file:
                        encoded_string = base64.b64encode(audio_file.read()).decode(
                            "utf-8"
                        )
                        return encoded_string

                def generate_html_audio(audio_path):
                    base64_audio = audio_to_base64(audio_path)
                    audio_format = audio_path.split(".")[-1]
                    return (
                        f"<audio controls style='max-width:100%;'>"
                        f"<source src='data:audio/{audio_format};base64,{base64_audio}' type='audio/{audio_format}'>"
                        f"Your browser does not support the audio tag.</audio>"
                    )

                display_content = f"{generate_html_audio(audio)}<br>{text}"
                message = {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text},
                        {
                            "type": "audio_url",
                            "audio_url": {"url": audio},
                        },
                    ],
                }

            else:
                display_content = text
                message = {"role": "user", "content": text}
            history = history + [message]
            bot = bot + [[display_content, None]]
            return history, bot, "", None, None
            return history, bot, "", None, None, None

        def clear_history():
            logger.debug("Clear history.")
            return [], None, "", None, None
            return [], None, "", None, None, None

        def update_button(text):
            return gr.update(interactive=bool(text))
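The `elif audio:` branch above mirrors the image and video paths: the chatbot pane gets an inline HTML `<audio>` player, while the message recorded in `history` uses an `audio_url` content part. A standalone sketch of the resulting user message (shape copied from `add_text`; the inputs are placeholders):

```python
import json

# Shape of the message add_text() appends for an audio upload.
audio, text = "/tmp/recording.wav", "Transcribe this."  # placeholder inputs
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": text},
        {"type": "audio_url", "audio_url": {"url": audio}},
    ],
}
print(json.dumps(message, indent=2))
```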

        has_vision = "vision" in self.model_ability
        has_audio = "audio" in self.model_ability

        with gr.Blocks(
            title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
            css="""
@@ -484,11 +554,29 @@ def update_button(text):
            state = gr.State([])
            with gr.Row():
                chatbot = gr.Chatbot(
                    elem_id="chatbot", label=self.model_name, height=700, scale=7
                    elem_id="chatbot", label=self.model_name, scale=7, min_height=900
                )
                with gr.Column(scale=3):
                    imagebox = gr.Image(type="filepath")
                    videobox = gr.Video()
                    if has_vision:
                        imagebox = gr.Image(type="filepath")
                        videobox = gr.Video()
                    else:
                        imagebox = gr.Image(type="filepath", visible=False)
                        videobox = gr.Video(visible=False)

                    if has_audio:
                        audiobox = gr.Audio(
                            sources=["microphone", "upload"],
                            type="filepath",
                            visible=True,
                        )
                    else:
                        audiobox = gr.Audio(
                            sources=["microphone", "upload"],
                            type="filepath",
                            visible=False,
                        )

                    textbox = gr.Textbox(
                        show_label=False,
                        placeholder="Enter text and press ENTER",
@@ -516,8 +604,8 @@ def update_button(text):

            textbox.submit(
                add_text,
                [state, chatbot, textbox, imagebox, videobox],
                [state, chatbot, textbox, imagebox, videobox],
                [state, chatbot, textbox, imagebox, videobox, audiobox],
                [state, chatbot, textbox, imagebox, videobox, audiobox],
                queue=False,
            ).then(
                predict,
@@ -527,8 +615,8 @@ def update_button(text):

            submit_btn.click(
                add_text,
                [state, chatbot, textbox, imagebox, videobox],
                [state, chatbot, textbox, imagebox, videobox],
                [state, chatbot, textbox, imagebox, videobox, audiobox],
                [state, chatbot, textbox, imagebox, videobox, audiobox],
                queue=False,
            ).then(
                predict,
@@ -539,7 +627,7 @@ def update_button(text):
            clear_btn.click(
                clear_history,
                None,
                [state, chatbot, textbox, imagebox, videobox],
                [state, chatbot, textbox, imagebox, videobox, audiobox],
                queue=False,
            )

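On the response side, `predict` assumes that when the model speaks its answer, the assistant message carries base64-encoded WAV bytes under `message["audio"]["data"]`. A self-contained sketch of that decode step (the response dict is a stand-in with dummy bytes, shaped like the one handled above):

```python
import base64

# Stand-in for a chat-completion response; "data" would be real WAV bytes.
response = {
    "choices": [
        {
            "message": {
                "role": "assistant",
                "content": "Here is the spoken answer.",
                "audio": {"data": base64.b64encode(b"RIFF\x00\x00WAVE").decode()},
            }
        }
    ]
}

message = response["choices"][0]["message"]
if "audio" in message:
    # Decode the base64 payload to a .wav file, as predict() does via tempfile.
    with open("reply.wav", "wb") as f:
        f.write(base64.b64decode(message["audio"]["data"]))
```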
1 change: 1 addition & 0 deletions xinference/deploy/docker/requirements.txt
@@ -99,6 +99,7 @@ misaki[en,ja,zh]>=0.7.15 # Kokoro
en_core_web_trf@https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl # Kokoro misaki[en]
en_core_web_sm@https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl # Kokoro misaki[en]
qwen-vl-utils!=0.0.9 # For qwen2-vl
qwen_omni_utils # For qwen2.5-omni
datamodel_code_generator # for minicpm-4B
jsonschema # for minicpm-4B
deepcache # for sd
1 change: 1 addition & 0 deletions xinference/deploy/docker/requirements_cpu.txt
@@ -92,6 +92,7 @@ misaki[en,ja,zh]>=0.7.15 # Kokoro
en_core_web_trf@https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl # Kokoro misaki[en]
en_core_web_sm@https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl # Kokoro misaki[en]
qwen-vl-utils!=0.0.9 # For qwen2-vl
qwen_omni_utils # For qwen2.5-omni
datamodel_code_generator # for minicpm-4B
jsonschema # for minicpm-4B
verovio>=4.3.1 # For got_ocr2
41 changes: 41 additions & 0 deletions xinference/model/llm/llm_family.json
@@ -7909,6 +7909,47 @@
"<|endoftext|>"
    ]
  },
  {
    "version":1,
    "context_length":32768,
    "model_name":"qwen2.5-omni",
    "model_lang":[
      "en",
      "zh"
    ],
    "model_ability":[
      "chat",
      "vision",
      "audio",
      "omni"
    ],
    "model_description":"Qwen2.5-Omni: the new flagship end-to-end multimodal model in the Qwen series.",
    "model_specs":[
      {
        "model_format":"pytorch",
        "model_size_in_billions":7,
        "quantizations":[
          "none"
        ],
        "model_id":"Qwen/Qwen2.5-Omni-7B"
      }
    ],
    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
    "stop_token_ids": [
      151645,
      151643
    ],
    "stop": [
      "<|im_end|>",
      "<|endoftext|>"
    ],
    "virtualenv": {
      "packages": [
        "git+https://github.com/huggingface/[email protected]",
        "numpy==1.26.4"
      ]
    }
  },
  {
    "version": 1,
    "context_length": 32768,
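With this entry registered, the model launches like any other chat family. A hedged end-to-end sketch using the xinference Python client (endpoint, engine name, and the exact `chat` signature are assumptions about a local deployment, not part of this diff):

```python
from xinference.client import Client

client = Client("http://localhost:9997")  # assumed supervisor endpoint
model_uid = client.launch_model(
    model_name="qwen2.5-omni",
    model_engine="transformers",  # assumed engine for the pytorch spec above
    model_format="pytorch",
)
model = client.get_model(model_uid)

# Content parts follow the shapes used by the chat UI above.
response = model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this recording."},
                {"type": "audio_url", "audio_url": {"url": "sample.wav"}},
            ],
        }
    ],
    generate_config={"max_tokens": 512},
)
print(response["choices"][0]["message"]["content"])
```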
4 changes: 3 additions & 1 deletion xinference/model/llm/llm_family.py
@@ -135,7 +135,9 @@ class LLMFamilyV1(BaseModel):
    model_name: str
    model_lang: List[str]
    model_ability: List[
        Literal["embed", "generate", "chat", "tools", "vision", "audio", "reasoning"]
        Literal[
            "embed", "generate", "chat", "tools", "vision", "audio", "omni", "reasoning"
        ]
    ]
    model_description: Optional[str]
    # reason for not required str here: legacy registration
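The schema change is just a widening of the `model_ability` literal with `"omni"`. A trimmed, runnable sketch of what the validator now accepts (field list reduced for illustration; the real `LLMFamilyV1` has many more fields):

```python
from typing import List, Literal

from pydantic import BaseModel, ConfigDict

Ability = Literal[
    "embed", "generate", "chat", "tools", "vision", "audio", "omni", "reasoning"
]

class FamilySketch(BaseModel):
    # Trimmed stand-in for LLMFamilyV1; only the widened field is shown.
    # Allow field names starting with "model_" under pydantic v2.
    model_config = ConfigDict(protected_namespaces=())

    model_name: str
    model_ability: List[Ability]

family = FamilySketch(
    model_name="qwen2.5-omni",
    model_ability=["chat", "vision", "audio", "omni"],
)
print(family.model_ability)  # ['chat', 'vision', 'audio', 'omni']
```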
42 changes: 42 additions & 0 deletions xinference/model/llm/llm_family_modelscope.json
@@ -5666,6 +5666,48 @@
"<|endoftext|>"
    ]
  },
  {
    "version":1,
    "context_length":32768,
    "model_name":"qwen2.5-omni",
    "model_lang":[
      "en",
      "zh"
    ],
    "model_ability":[
      "chat",
      "vision",
      "audio",
      "omni"
    ],
    "model_description":"Qwen2.5-Omni: the new flagship end-to-end multimodal model in the Qwen series.",
    "model_specs":[
      {
        "model_format":"pytorch",
        "model_size_in_billions":7,
        "quantizations":[
          "none"
        ],
        "model_hub": "modelscope",
        "model_id":"Qwen/Qwen2.5-Omni-7B"
      }
    ],
    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
    "stop_token_ids": [
      151645,
      151643
    ],
    "stop": [
      "<|im_end|>",
      "<|endoftext|>"
    ],
    "virtualenv": {
      "packages": [
        "git+https://github.com/huggingface/[email protected]",
        "numpy==1.26.4"
      ]
    }
  },
  {
    "version": 1,
    "context_length": 32768,
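Both registrations pin the same `virtualenv` packages: a preview `transformers` branch (qwen2.5-omni support had not yet shipped in a stable release when this was merged) plus `numpy==1.26.4`. For experimenting outside the managed virtualenv, an equivalent manual install would be (same pins as above; hypothetical workflow):

```python
import subprocess
import sys

# Install the same pins as the virtualenv block above into the current env.
subprocess.check_call([
    sys.executable, "-m", "pip", "install",
    "git+https://github.com/huggingface/[email protected]",
    "numpy==1.26.4",
])
```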