@@ -1427,39 +1427,29 @@ inference request. For example,
import triton_python_backend_utils as pb_utils

class TritonPythonModel:
-    ...
+    ...
    def execute(self, requests):
-        ...
-        infer_request = pb_utils.InferenceRequest(
-            model_name='model_name',
-            requested_output_names=['REQUESTED_OUTPUT'],
-            inputs=[<pb_utils.Tensor object>])
+        ...
+        bls_response_iterator = bls_request.exec(decoupled=True)
+        ...
+        bls_response_iterator.cancel()
+        ...
+```

-        # Execute the infer_request and wait for the response. Here we are
-        # running a BLS request on a decoupled model, hence setting the parameter
-        # 'decoupled' to 'True'.
-        infer_responses = infer_request.exec(decoupled=True)
+You may also call the `cancel()` method on the response iterator returned from
+the `async_exec()` method of the inference request. For example,

-        response_tensors_received = []
-        for infer_response in infer_responses:
-            # Check if the inference response indicates an error.
-            # vLLM backend uses the CANCELLED error code when a request is cancelled.
-            # TensorRT-LLM backend does not use error codes; instead, it sends the
-            # TRITONSERVER_RESPONSE_COMPLETE_FINAL flag to the iterator.
-            if infer_response.has_error():
-                if infer_response.error().code() == pb_utils.TritonError.CANCELLED:
-                    print("request has been cancelled.")
-                break
-
-            # Collect the output tensor from the model's response
-            output = pb_utils.get_output_tensor_by_name(
-                infer_response, 'REQUESTED_OUTPUT')
-            response_tensors_received.append(output)
-
-            # Check if we have received enough inference output tensors
-            # and then cancel the response iterator
-            if has_enough_response(response_tensors_received):
-                infer_responses.cancel()
+```python
+import triton_python_backend_utils as pb_utils
+
+class TritonPythonModel:
+    ...
+    async def execute(self, requests):
+        ...
+        bls_response_iterator = await bls_request.async_exec(decoupled=True)
+        ...
+        bls_response_iterator.cancel()
+        ...
```
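
The examples above elide how the response iterator is consumed before `cancel()`
is called. The following is a minimal sketch of one way to drain the iterator and
cancel once enough responses have been collected; as in the examples above, the
construction of `bls_request` is elided, the `REQUESTED_OUTPUT` name is reused
from the earlier example, and `has_enough_response()` is a hypothetical helper.

```python
import triton_python_backend_utils as pb_utils

class TritonPythonModel:
    ...
    def execute(self, requests):
        ...
        # 'decoupled=True' makes exec() return an iterator of responses rather
        # than a single response object.
        bls_response_iterator = bls_request.exec(decoupled=True)

        response_tensors_received = []
        for infer_response in bls_response_iterator:
            # A cancelled request may surface as a response carrying the
            # CANCELLED error code.
            if infer_response.has_error():
                if infer_response.error().code() == pb_utils.TritonError.CANCELLED:
                    print("request has been cancelled.")
                break

            # Collect the requested output tensor from this response.
            output = pb_utils.get_output_tensor_by_name(
                infer_response, 'REQUESTED_OUTPUT')
            response_tensors_received.append(output)

            # Cancel the decoupled BLS request once enough responses have
            # arrived (has_enough_response() is assumed to be defined elsewhere).
            if has_enough_response(response_tensors_received):
                bls_response_iterator.cancel()
        ...
```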
Note: Whether the decoupled model returns a cancellation error and stops executing