@@ -14,7 +14,6 @@
 from grpc._cython.cygrpc import AbortError
 from grpc_health.v1 import health, health_pb2, health_pb2_grpc
 from grpc_reflection.v1alpha import reflection
-from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.multiprocessing import MQEngineDeadError
 from vllm.entrypoints.openai.serving_completion import merge_async_iterators
 from vllm.inputs import TokensPrompt, token_inputs
@@ -34,11 +33,6 @@
     ExpDecayLengthPenaltyWarper,
     TypicalLogitsWarperWrapper,
 )
-from vllm_tgis_adapter.tgis_utils.metrics import (
-    FailureReasonLabel,
-    ServiceMetrics,
-    TGISStatLogger,
-)
 from vllm_tgis_adapter.utils import to_list

 from .adapters import AdapterStore, validate_adapters
@@ -84,7 +78,6 @@
 _F = TypeVar("_F", Callable, Coroutine)

 logger = init_logger(__name__)
-service_metrics = ServiceMetrics()

 ADD_SPECIAL_TOKENS: bool = os.getenv("ADD_SPECIAL_TOKENS", "true").lower() not in (
     "0",
@@ -110,8 +103,6 @@ async def _handle_exception(
     **kwargs: dict[str, Any],
 ) -> None:
     context: ServicerContext = kwargs.get("context") or args[-1]
-    is_generate_fn = "generate" in func.__name__.lower()
-
     # self.engine on the servicer
     engine = args[0].engine
     # If the engine has died, then the server cannot process any further
@@ -132,13 +123,8 @@ async def _handle_exception(

     if isinstance(e, OutOfMemoryError):
         logger.exception("%s caused GPU OOM error", func.__name__)
-        service_metrics.count_request_failure(FailureReasonLabel.OOM)
         await context.abort(StatusCode.RESOURCE_EXHAUSTED, str(e))
-    elif is_generate_fn:
-        service_metrics.count_request_failure(FailureReasonLabel.GENERATE)
-    else:
-        service_metrics.count_request_failure(FailureReasonLabel.UNKNOWN)
-    if isinstance(e, MQEngineDeadError):
+    elif isinstance(e, MQEngineDeadError):
         logger.error(e)
         return
     logger.exception("%s failed", func.__name__)
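
`_handle_exception` is reached through the `@log_rpc_handler_errors` decorator that wraps each RPC method below. The decorator itself is outside this diff; here is a minimal sketch of the wiring, assuming the `(e, func, *args, **kwargs)` calling convention that `_handle_exception`'s signature suggests:

```python
import functools
from collections.abc import Callable


def log_rpc_handler_errors(func: Callable) -> Callable:
    # Assumed wiring: funnel any handler exception into _handle_exception,
    # which logs it and aborts the gRPC context with a suitable status code.
    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        try:
            return await func(*args, **kwargs)
        except Exception as e:  # noqa: BLE001
            await _handle_exception(e, func, *args, **kwargs)

    return wrapper
```

Note that a streaming handler such as `GenerateStream` is an async generator, so the real decorator presumably special-cases it with an `async for`-based wrapper rather than a plain `await`.
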
@@ -201,20 +187,6 @@ def __init__(

     async def post_init(self) -> None:
         self.config = await self.engine.get_model_config()
-
-        if not isinstance(self.engine, AsyncLLMEngine):
-            logger.warning(
-                "TGIS Metrics currently disabled in decoupled front-end mode, "
-                "set DISABLE_FRONTEND_MULTIPROCESSING=True to enable"
-            )
-        else:
-            # Swap in the special TGIS stats logger
-            tgis_stats_logger = TGISStatLogger(
-                vllm_stat_logger=self.engine.engine.stat_loggers["prometheus"],
-                max_sequence_len=self.config.max_model_len,
-            )
-            self.engine.engine.stat_loggers["prometheus"] = tgis_stats_logger
-
         self.health_servicer.set(
             self.SERVICE_NAME,
             health_pb2.HealthCheckResponse.SERVING,
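
The `post_init` hunk above is where the TGIS metrics were actually wired into vLLM: when the engine ran in-process as an `AsyncLLMEngine`, the adapter replaced the engine's `"prometheus"` stat logger with a `TGISStatLogger` wrapping it, and under the decoupled multiprocessing front end (where `self.engine.engine` is not reachable) it logged the warning instead. The removed class is defined elsewhere; a minimal sketch of the delegation pattern the deleted constructor call implies:

```python
class TGISStatLogger:
    """Sketch: wrap vLLM's Prometheus stat logger and forward to it."""

    def __init__(self, vllm_stat_logger, max_sequence_len: int) -> None:
        self._vllm_logger = vllm_stat_logger
        self._max_sequence_len = max_sequence_len  # e.g. for length buckets

    def log(self, stats) -> None:
        # Record TGIS-specific series here, then delegate so the standard
        # vllm:* metrics keep being emitted unchanged.
        self._vllm_logger.log(stats)
```
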
@@ -248,8 +220,6 @@ async def Generate(
         request: BatchedGenerationRequest,
         context: ServicerContext,
     ) -> BatchedGenerationResponse:
-        start_time = time.time()
-        service_metrics.count_generate_request(len(request.requests))
         request_id = self.request_id(context)
         kwargs = await self._validate_adapters(
             request,
@@ -302,7 +272,6 @@ async def Generate(
             if res.prompt is None:
                 res.prompt = request.requests[i].text
             responses[i] = res
-            service_metrics.observe_queue_time(res)

             if (
                 deadline is not None
@@ -328,19 +297,16 @@ async def Generate(
             response = self._convert_input_details(
                 res, resp_options, sampling_params, response, tokenizer
             )
-            service_metrics.observe_generation_success(start_time=start_time)
             responses[i] = response

         return BatchedGenerationResponse(responses=responses)

     @log_rpc_handler_errors
-    async def GenerateStream(  # noqa: PLR0915, C901
+    async def GenerateStream(  # noqa: C901
         self,
         request: SingleGenerationRequest,
         context: ServicerContext,
     ) -> AsyncIterator[GenerationResponse]:
-        start_time = time.time()
-        service_metrics.count_generate_request()
         request_id = self.request_id(context)
         adapter_kwargs = await self._validate_adapters(
             request,
@@ -395,9 +361,6 @@ async def GenerateStream(  # noqa: PLR0915, C901
             if first_response is None or (
                 result.prompt_token_ids and not generated_token_count
             ):
-                if first_response is None:
-                    service_metrics.observe_queue_time(result)
-
                 if result.prompt is None:
                     result.prompt = request.request.text

@@ -453,7 +416,6 @@ async def GenerateStream(  # noqa: PLR0915, C901
         first_response.stop_reason = last_response.stop_reason
         first_response.stop_sequence = last_response.stop_sequence
         first_response.generated_token_count = last_response.generated_token_count
-        service_metrics.observe_generation_success(start_time=start_time)

     def _convert_input_details(
         self,
@@ -544,7 +506,6 @@ async def _validate_and_convert_params(
         try:
             validate_params(params, self.max_max_new_tokens)
         except ValueError as tgis_validation_error:
-            service_metrics.count_request_failure(FailureReasonLabel.VALIDATION)
             await context.abort(StatusCode.INVALID_ARGUMENT, str(tgis_validation_error))

         resp_options = params.response
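
A detail worth keeping in mind across this and the following validation hunks: in `grpc.aio`, `context.abort()` never returns; it raises (surfacing as the `AbortError` imported at the top of this module). That is why the deleted `count_request_failure(FailureReasonLabel.VALIDATION)` calls had to sit before the abort, and why no `return` is needed after it:

```python
# Annotated excerpt of the hunk above (not standalone): ServicerContext.abort()
# raises rather than returning, so the except block ends at the abort call.
try:
    validate_params(params, self.max_max_new_tokens)
except ValueError as tgis_validation_error:
    # Bookkeeping, such as the removed failure counter, must run before abort().
    await context.abort(StatusCode.INVALID_ARGUMENT, str(tgis_validation_error))
```
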
@@ -650,7 +611,6 @@ async def _validate_and_convert_params(
         except ValueError as vllm_validation_error:
             # There may be validation cases caught by vLLM that are not covered
             # by the TGIS api validation
-            service_metrics.count_request_failure(FailureReasonLabel.VALIDATION)
             await context.abort(StatusCode.INVALID_ARGUMENT, str(vllm_validation_error))

         return sampling_params, deadline
@@ -670,7 +630,6 @@ async def _validate_adapters(
                 vllm_model_handler=vllm_model_handler,
             )
         except ValueError as e:
-            service_metrics.count_request_failure(FailureReasonLabel.VALIDATION)
             await context.abort(StatusCode.INVALID_ARGUMENT, str(e))
         return adapters

@@ -844,9 +803,6 @@ async def Tokenize(
             tokenized results.

         """
-        # Log the incoming tokenization request for metrics
-        service_metrics.count_tokenization_request(request)
-
         # TODO simplify to only check for lora adapter
         adapter_kwargs = await self._validate_adapters(
             request,
@@ -903,7 +859,6 @@ async def Tokenize(
         )

         response = BatchedTokenizeResponse(responses=responses)
-        service_metrics.observe_tokenization_response(response)
         return response

     @log_rpc_handler_errors