Skip to content

Commit 3ac7505

Browse files
committed
responses cancellation
adjust the comments title add finished check rename function adjust comment comment comment fix fix fix readme fix comments fix leak
1 parent 1b797d6 commit 3ac7505

18 files changed

+410
-19
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -140,3 +140,4 @@ dmypy.json
140140

141141
# vscode
142142
.vscode/settings.json
143+
.vscode/c_cpp_properties.json

CMakeLists.txt

+2
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,8 @@ set(
241241
src/pb_response_iterator.cc
242242
src/pb_cancel.cc
243243
src/pb_cancel.h
244+
src/pb_bls_cancel.cc
245+
src/pb_bls_cancel.h
244246
)
245247

246248
list(APPEND

README.md

+53-2
Original file line numberDiff line numberDiff line change
@@ -1409,14 +1409,65 @@ class TritonPythonModel:
14091409
A complete example for sync and async BLS for decoupled models is included in
14101410
the [Examples](#examples) section.
14111411

1412+
Note: Async BLS is not supported on Python 3.6 or lower due to the `async`
1413+
keyword and `asyncio.run` being introduced in Python 3.7.
1414+
14121415
Starting from the 22.04 release, the lifetime of the BLS output tensors has
14131416
been improved such that if a tensor is no longer needed in your Python model it
14141417
will be automatically deallocated. This can increase the number of BLS requests
14151418
that you can execute in your model without running into the out of GPU or
14161419
shared memory error.
14171420

1418-
Note: Async BLS is not supported on Python 3.6 or lower due to the `async`
1419-
keyword and `asyncio.run` being introduced in Python 3.7.
1421+
Starting from the 25.04 release, you can use the `infer_responses.cancel()` function
1422+
on a BLS decoupled response iterator to stop the response stream, which cancels
1423+
the request to the decoupled model. This is useful for stopping long inference
1424+
requests, such as those from auto-generative large language models, which may
1425+
run for an indeterminate amount of time and consume significant server resources.
1426+
The response iterator can be generated from `infer_request.exec(decoupled=True)`
1427+
and `infer_request.async_exec(decoupled=True)` functions:
1428+
1429+
```python
1430+
import triton_python_backend_utils as pb_utils
1431+
1432+
class TritonPythonModel:
1433+
...
1434+
def execute(self, requests):
1435+
...
1436+
        infer_request = pb_utils.InferenceRequest(
1437+
model_name='model_name',
1438+
requested_output_names=['REQUESTED_OUTPUT'],
1439+
inputs=[<pb_utils.Tensor object>])
1440+
1441+
# Execute the inference_request and wait for the response. Here we are
1442+
# running a BLS request on a decoupled model, hence setting the parameter
1443+
# 'decoupled' to 'True'.
1444+
infer_responses = infer_request.exec(decoupled=True)
1445+
1446+
response_tensors_received = []
1447+
for infer_response in infer_responses:
1448+
# Check if the inference response indicates an error.
1449+
# vLLM backend uses the CANCELLED error code when a request is cancelled.
1450+
# TensorRT-LLM backend does not use error codes; instead, it sends the
1451+
# TRITONSERVER_RESPONSE_COMPLETE_FINAL flag to the iterator.
1452+
            if infer_response.has_error():
1453+
if infer_response.error().code() == pb_utils.TritonError.CANCELLED:
1454+
print("request has been cancelled.")
1455+
break
1456+
1457+
# Collect the output tensor from the model's response
1458+
output = pb_utils.get_output_tensor_by_name(
1459+
                infer_response, 'REQUESTED_OUTPUT')
1460+
response_tensors_received.append(output)
1461+
1462+
# Check if we have received enough inference output tensors
1463+
# and then cancel the response iterator
1464+
if has_enough_response(response_tensors_received):
1465+
infer_responses.cancel()
1466+
```
1467+
1468+
Note: Whether the decoupled model returns a cancellation error and stops executing
1469+
the request depends on the model's backend implementation. Please refer to the
1470+
documentation for more details: [Handling in Backend](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/request_cancellation.md#handling-in-backend).
14201471

14211472
## Model Loading API
14221473

src/infer_payload.cc

+15-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Redistribution and use in source and binary forms, with or without
44
// modification, are permitted provided that the following conditions
@@ -31,7 +31,8 @@ namespace triton { namespace backend { namespace python {
3131
InferPayload::InferPayload(
3232
const bool is_decoupled,
3333
std::function<void(std::unique_ptr<InferResponse>)> callback)
34-
: is_decoupled_(is_decoupled), is_promise_set_(false), callback_(callback)
34+
: is_decoupled_(is_decoupled), is_promise_set_(false), callback_(callback),
35+
request_address_(reinterpret_cast<intptr_t>(nullptr))
3536
{
3637
promise_.reset(new std::promise<std::unique_ptr<InferResponse>>());
3738
}
@@ -91,4 +92,16 @@ InferPayload::ResponseAllocUserp()
9192
return response_alloc_userp_;
9293
}
9394

95+
void
96+
InferPayload::SetRequestAddress(intptr_t request_address)
97+
{
98+
request_address_ = request_address;
99+
}
100+
101+
intptr_t
102+
InferPayload::GetRequestAddress()
103+
{
104+
return request_address_;
105+
}
106+
94107
}}} // namespace triton::backend::python

src/infer_payload.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Redistribution and use in source and binary forms, with or without
44
// modification, are permitted provided that the following conditions
@@ -62,6 +62,8 @@ class InferPayload : public std::enable_shared_from_this<InferPayload> {
6262
void SetResponseAllocUserp(
6363
const ResponseAllocatorUserp& response_alloc_userp);
6464
std::shared_ptr<ResponseAllocatorUserp> ResponseAllocUserp();
65+
void SetRequestAddress(intptr_t request_address);
66+
intptr_t GetRequestAddress();
6567

6668
private:
6769
std::unique_ptr<std::promise<std::unique_ptr<InferResponse>>> promise_;
@@ -70,6 +72,7 @@ class InferPayload : public std::enable_shared_from_this<InferPayload> {
7072
bool is_promise_set_;
7173
std::function<void(std::unique_ptr<InferResponse>)> callback_;
7274
std::shared_ptr<ResponseAllocatorUserp> response_alloc_userp_;
75+
intptr_t request_address_;
7376
};
7477

7578
}}} // namespace triton::backend::python

src/infer_response.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ InferResponse::SaveToSharedMemory(
9191
response_shm_ptr->is_error_set = false;
9292
shm_handle_ = response_shm_.handle_;
9393
response_shm_ptr->is_last_response = is_last_response_;
94+
response_shm_ptr->id = id_;
9495

9596
// Only save the output tensors to shared memory when the inference response
9697
// doesn't have error.
@@ -113,7 +114,6 @@ InferResponse::SaveToSharedMemory(
113114
tensor_handle_shm_ptr[j] = output_tensor->ShmHandle();
114115
j++;
115116
}
116-
response_shm_ptr->id = id_;
117117

118118
parameters_shm_ = PbString::Create(shm_pool, parameters_);
119119
response_shm_ptr->parameters = parameters_shm_->ShmHandle();

src/ipc_message.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Redistribution and use in source and binary forms, with or without
44
// modification, are permitted provided that the following conditions
@@ -67,7 +67,8 @@ typedef enum PYTHONSTUB_commandtype_enum {
6767
PYTHONSTUB_LoadModelRequest,
6868
PYTHONSTUB_UnloadModelRequest,
6969
PYTHONSTUB_ModelReadinessRequest,
70-
PYTHONSTUB_IsRequestCancelled
70+
PYTHONSTUB_IsRequestCancelled,
71+
PYTHONSTUB_CancelBLSDecoupledInferRequest
7172
} PYTHONSTUB_CommandType;
7273

7374
///

src/pb_bls_cancel.cc

+92
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
// Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
//
3+
// Redistribution and use in source and binary forms, with or without
4+
// modification, are permitted provided that the following conditions
5+
// are met:
6+
// * Redistributions of source code must retain the above copyright
7+
// notice, this list of conditions and the following disclaimer.
8+
// * Redistributions in binary form must reproduce the above copyright
9+
// notice, this list of conditions and the following disclaimer in the
10+
// documentation and/or other materials provided with the distribution.
11+
// * Neither the name of NVIDIA CORPORATION nor the names of its
12+
// contributors may be used to endorse or promote products derived
13+
// from this software without specific prior written permission.
14+
//
15+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16+
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19+
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22+
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23+
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
27+
#include "pb_bls_cancel.h"
28+
29+
#include "pb_stub.h"
30+
31+
namespace triton { namespace backend { namespace python {
32+
33+
void
34+
PbBLSCancel::SaveToSharedMemory(std::unique_ptr<SharedMemoryManager>& shm_pool)
35+
{
36+
cancel_shm_ = shm_pool->Construct<CancelBLSRequestMessage>();
37+
new (&(cancel_shm_.data_->mu)) bi::interprocess_mutex;
38+
new (&(cancel_shm_.data_->cv)) bi::interprocess_condition;
39+
cancel_shm_.data_->waiting_on_stub = false;
40+
cancel_shm_.data_->infer_payload_id = infer_playload_id_;
41+
cancel_shm_.data_->is_cancelled = is_cancelled_;
42+
}
43+
44+
bi::managed_external_buffer::handle_t
PbBLSCancel::ShmHandle()
{
  // Shared-memory handle of the cancellation message; valid only after
  // SaveToSharedMemory() has been called.
  return cancel_shm_.handle_;
}
49+
50+
CancelBLSRequestMessage*
PbBLSCancel::ShmPayload()
{
  // Raw pointer into shared memory; ownership stays with `cancel_shm_`.
  // Valid only after SaveToSharedMemory() has been called.
  return cancel_shm_.data_.get();
}
55+
56+
void
57+
PbBLSCancel::Cancel()
58+
{
59+
// Release the GIL. Python objects are not accessed during the check.
60+
py::gil_scoped_release gil_release;
61+
62+
std::unique_lock<std::mutex> lk(mu_);
63+
// The cancelled flag can only move from false to true, not the other way, so
64+
// it is checked on each query until cancelled and then implicitly cached.
65+
if (is_cancelled_) {
66+
return;
67+
}
68+
if (!updating_) {
69+
std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
70+
if (!stub->StubToParentServiceActive()) {
71+
LOG_ERROR << "Cannot communicate with parent service";
72+
return;
73+
}
74+
75+
stub->EnqueueCancelBLSDecoupledRequest(this);
76+
updating_ = true;
77+
}
78+
cv_.wait(lk, [this] { return !updating_; });
79+
}
80+
81+
void
82+
PbBLSCancel::ReportIsCancelled(bool is_cancelled)
83+
{
84+
{
85+
std::lock_guard<std::mutex> lk(mu_);
86+
is_cancelled_ = is_cancelled;
87+
updating_ = false;
88+
}
89+
cv_.notify_all();
90+
}
91+
92+
}}} // namespace triton::backend::python

src/pb_bls_cancel.h

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
// Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
//
3+
// Redistribution and use in source and binary forms, with or without
4+
// modification, are permitted provided that the following conditions
5+
// are met:
6+
// * Redistributions of source code must retain the above copyright
7+
// notice, this list of conditions and the following disclaimer.
8+
// * Redistributions in binary form must reproduce the above copyright
9+
// notice, this list of conditions and the following disclaimer in the
10+
// documentation and/or other materials provided with the distribution.
11+
// * Neither the name of NVIDIA CORPORATION nor the names of its
12+
// contributors may be used to endorse or promote products derived
13+
// from this software without specific prior written permission.
14+
//
15+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16+
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19+
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22+
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23+
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
27+
#pragma once
28+
29+
#include <condition_variable>
30+
#include <mutex>
31+
32+
#include "pb_utils.h"
33+
34+
namespace triton { namespace backend { namespace python {
35+
36+
class PbBLSCancel {
37+
public:
38+
PbBLSCancel(void* infer_playload_id)
39+
: updating_(false), infer_playload_id_(infer_playload_id),
40+
is_cancelled_(false)
41+
{
42+
}
43+
DISALLOW_COPY_AND_ASSIGN(PbBLSCancel);
44+
45+
void SaveToSharedMemory(std::unique_ptr<SharedMemoryManager>& shm_pool);
46+
bi::managed_external_buffer::handle_t ShmHandle();
47+
CancelBLSRequestMessage* ShmPayload();
48+
49+
void Cancel();
50+
void ReportIsCancelled(bool is_cancelled);
51+
52+
private:
53+
AllocatedSharedMemory<CancelBLSRequestMessage> cancel_shm_;
54+
55+
std::mutex mu_;
56+
std::condition_variable cv_;
57+
bool updating_;
58+
59+
void* infer_playload_id_;
60+
bool is_cancelled_;
61+
};
62+
63+
}}}; // namespace triton::backend::python

src/pb_response_iterator.cc

+10-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Redistribution and use in source and binary forms, with or without
44
// modification, are permitted provided that the following conditions
@@ -40,6 +40,7 @@ ResponseIterator::ResponseIterator(
4040
: id_(response->Id()), is_finished_(false), is_cleared_(false), idx_(0)
4141
{
4242
response_buffer_.push(response);
43+
pb_bls_cancel_ = std::make_shared<PbBLSCancel>(response->Id());
4344
}
4445

4546
ResponseIterator::~ResponseIterator()
@@ -159,4 +160,12 @@ ResponseIterator::GetExistingResponses()
159160
return responses;
160161
}
161162

163+
void
164+
ResponseIterator::Cancel()
165+
{
166+
if (!is_finished_) {
167+
pb_bls_cancel_->Cancel();
168+
}
169+
}
170+
162171
}}} // namespace triton::backend::python

src/pb_response_iterator.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Redistribution and use in source and binary forms, with or without
44
// modification, are permitted provided that the following conditions
@@ -29,6 +29,7 @@
2929
#include <queue>
3030

3131
#include "infer_response.h"
32+
#include "pb_bls_cancel.h"
3233

3334
namespace triton { namespace backend { namespace python {
3435

@@ -43,6 +44,7 @@ class ResponseIterator {
4344
void* Id();
4445
void Clear();
4546
std::vector<std::shared_ptr<InferResponse>> GetExistingResponses();
47+
void Cancel();
4648

4749
private:
4850
std::vector<std::shared_ptr<InferResponse>> responses_;
@@ -53,6 +55,7 @@ class ResponseIterator {
5355
bool is_finished_;
5456
bool is_cleared_;
5557
size_t idx_;
58+
std::shared_ptr<PbBLSCancel> pb_bls_cancel_;
5659
};
5760

5861
}}} // namespace triton::backend::python

0 commit comments

Comments
 (0)