 import asyncio

 from dotenv import load_dotenv
-from typing import AsyncGenerator
+from typing import AsyncGenerator, Optional
 import time

 from vllm import AsyncLLMEngine
+from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest, ErrorResponse
@@ -151,17 +152,22 @@ async def _initialize_engines(self):
             model_config=self.model_config,
             models=self.serving_models,
             response_role=self.response_role,
+            request_logger=None,
             chat_template=self.tokenizer.tokenizer.chat_template,
+            chat_template_content_format="auto",
+            enable_reasoning=os.getenv('ENABLE_REASONING', 'false').lower() == 'true',
+            reasoning_parser=None,
+            return_tokens_as_token_ids=False,
             enable_auto_tools=os.getenv('ENABLE_AUTO_TOOL_CHOICE', 'false').lower() == 'true',
             tool_parser=os.getenv('TOOL_CALL_PARSER', "") or None,
-            lora_modules=lora_modules,
-            chat_template_content_format="auto",
+            enable_prompt_tokens_details=False
         )
         self.completion_engine = OpenAIServingCompletion(
             engine_client=self.llm,
             model_config=self.model_config,
             models=self.serving_models,
-            lora_modules=lora_modules,
+            request_logger=None,
+            return_tokens_as_token_ids=False,
         )

     async def generate(self, openai_request: JobInput):
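
Side note on the pattern above: every toggle this commit adds is read from the environment with the idiom os.getenv(NAME, 'false').lower() == 'true'. Below is a minimal, runnable sketch of that idiom, assuming a .env file loaded via load_dotenv(); the env_flag helper is hypothetical and not part of this repo, and only the variable names are taken from the diff.

import os

from dotenv import load_dotenv

# Pick up the same .env file the worker loads at startup.
load_dotenv()

def env_flag(name: str, default: str = 'false') -> bool:
    # Truthiness rule used in the diff: only the literal string
    # 'true' (case-insensitive) turns a feature on.
    return os.getenv(name, default).lower() == 'true'

# The knobs the commit wires into OpenAIServingChat:
enable_reasoning = env_flag('ENABLE_REASONING')
enable_auto_tools = env_flag('ENABLE_AUTO_TOOL_CHOICE')
# An unset or empty TOOL_CALL_PARSER collapses to None, matching
# os.getenv('TOOL_CALL_PARSER', "") or None in the constructor call.
tool_parser = os.getenv('TOOL_CALL_PARSER', "") or None

print(enable_reasoning, enable_auto_tools, tool_parser)

With this rule, ENABLE_REASONING=true, ENABLE_REASONING=True, and ENABLE_REASONING=TRUE all enable the flag, while any other value (including 1) leaves it off.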