diff --git a/src/infer_request.cc b/src/infer_request.cc
index 3ecde9e8..5fdae669 100644
--- a/src/infer_request.cc
+++ b/src/infer_request.cc
@@ -44,12 +44,13 @@ InferRequest::InferRequest(
     const std::string& model_name, const int64_t model_version,
     const std::string& parameters, const uint32_t flags, const int32_t timeout,
     const intptr_t response_factory_address, const intptr_t request_address,
-    const PreferredMemory& preferred_memory)
+    const PreferredMemory& preferred_memory, const InferenceTrace& trace)
     : request_id_(request_id), correlation_id_(correlation_id),
       inputs_(inputs), requested_output_names_(requested_output_names),
       model_name_(model_name), model_version_(model_version),
       parameters_(parameters), flags_(flags), timeout_(timeout),
       response_factory_address_(response_factory_address),
-      request_address_(request_address), preferred_memory_(preferred_memory)
+      request_address_(request_address), preferred_memory_(preferred_memory),
+      trace_(trace)
 {
   for (auto& input : inputs) {
     if (!input) {
@@ -166,6 +167,12 @@ InferRequest::GetPreferredMemory()
   return preferred_memory_;
 }
 
+InferenceTrace&
+InferRequest::Trace()
+{
+  return trace_;
+}
+
 void
 InferRequest::SaveToSharedMemory(std::unique_ptr<SharedMemoryManager>& shm_pool)
 {
@@ -191,6 +198,7 @@ InferRequest::SaveToSharedMemory(std::unique_ptr<SharedMemoryManager>& shm_pool)
   infer_request_shm_ptr_->is_decoupled = is_decoupled_;
   infer_request_shm_ptr_->timeout = timeout_;
   infer_request_shm_ptr_->preferred_memory = preferred_memory_;
+  infer_request_shm_ptr_->trace = trace_;
 
   output_names_handle_shm_ptr_ =
       reinterpret_cast<bi::managed_external_buffer::handle_t*>(
@@ -368,6 +376,7 @@ InferRequest::InferRequest(
   is_decoupled_ = infer_request_shm_ptr_->is_decoupled;
   timeout_ = infer_request_shm_ptr_->timeout;
   preferred_memory_ = infer_request_shm_ptr_->preferred_memory;
+  trace_ = infer_request_shm_ptr_->trace;
 
 #ifdef TRITON_PB_STUB
   response_sender_ = std::make_shared<ResponseSender>(
diff --git a/src/infer_request.h b/src/infer_request.h
index 7eb2fd88..7ef3a363 100644
--- a/src/infer_request.h
+++ b/src/infer_request.h
@@ -41,6 +41,17 @@ namespace triton { namespace backend { namespace python {
 
 class Stub;
 
+//
+// Inference Trace
+//
+struct InferenceTrace {
+#ifndef TRITON_PB_STUB
+  TRITONSERVER_InferenceTrace* triton_trace_;
+#else
+  void* triton_trace_;
+#endif
+};
+
 //
 // Inference Request
 //
@@ -55,6 +66,7 @@ struct InferRequestShm {
   bool is_decoupled;
   int32_t timeout;
   PreferredMemory preferred_memory;
+  InferenceTrace trace;
 };
 
 class InferRequest {
@@ -68,7 +80,8 @@ class InferRequest {
       const int32_t timeout = 0, const intptr_t response_factory_address = 0,
       const intptr_t request_address = 0,
       const PreferredMemory& preferred_memory =
-          PreferredMemory(PreferredMemory::DEFAULT, 0));
+          PreferredMemory(PreferredMemory::DEFAULT, 0),
+      const InferenceTrace& trace = {.triton_trace_ = nullptr});
 
   const std::vector<std::shared_ptr<PbTensor>>& Inputs();
   const std::string& RequestId();
@@ -84,6 +97,7 @@ class InferRequest {
   bool IsDecoupled();
   void SetIsDecoupled(const bool is_decoupled);
   PreferredMemory& GetPreferredMemory();
+  InferenceTrace& Trace();
 
 #ifdef TRITON_PB_STUB
   std::shared_ptr<InferResponse> Exec(const bool is_decoupled);
@@ -139,6 +153,7 @@ class InferRequest {
   intptr_t request_address_;
   bool is_decoupled_;
   PreferredMemory preferred_memory_;
+  InferenceTrace trace_;
 
   // Shared Memory Data Structures
   AllocatedSharedMemory<char> infer_request_shm_;
diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index eb561dec..b7df94c6 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -1362,6 +1362,9 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
       .value("TRITONSERVER_MEMORY_CPU", PreferredMemory::MemoryType::CPU)
       .export_values();
 
+  py::class_<InferenceTrace, std::shared_ptr<InferenceTrace>>(
+      module, "InferenceTrace");
+
   py::class_<InferRequest, std::shared_ptr<InferRequest>>(
       module, "InferenceRequest")
       .def(
@@ -1371,7 +1374,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
              const std::string& model_name, const int64_t model_version,
              const uint32_t flags, const int32_t timeout,
-             const PreferredMemory& preferred_memory) {
+             const PreferredMemory& preferred_memory,
+             const InferenceTrace& trace) {
            std::set<std::string> requested_outputs;
            for (auto& requested_output_name : requested_output_names) {
              requested_outputs.emplace(requested_output_name);
@@ -1381,7 +1385,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
                request_id, correlation_id, inputs, requested_outputs,
                model_name, model_version, "" /*parameters*/, flags, timeout,
                0 /*response_factory_address*/, 0 /*request_address*/,
-               preferred_memory);
+               preferred_memory, trace);
          }),
      py::arg("request_id").none(false) = "",
      py::arg("correlation_id").none(false) = 0,
@@ -1391,7 +1395,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
      py::arg("model_version").none(false) = -1,
      py::arg("flags").none(false) = 0, py::arg("timeout").none(false) = 0,
      py::arg("preferred_memory").none(false) =
-         PreferredMemory(PreferredMemory::DEFAULT, 0))
+         PreferredMemory(PreferredMemory::DEFAULT, 0),
+     py::arg("trace").none(false) = nullptr)
      .def(
          "inputs", &InferRequest::Inputs,
          py::return_value_policy::reference_internal)
@@ -1401,6 +1406,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
      .def("set_flags", &InferRequest::SetFlags)
      .def("timeout", &InferRequest::Timeout)
      .def("parameters", &InferRequest::Parameters)
+     .def("trace", &InferRequest::Trace)
      .def(
          "exec",
          [](std::shared_ptr<InferRequest>& infer_request,
diff --git a/src/python_be.cc b/src/python_be.cc
index 6f25e024..bb2c4e49 100644
--- a/src/python_be.cc
+++ b/src/python_be.cc
@@ -364,6 +364,11 @@ ModelInstanceState::SaveRequestsToSharedMemory(
     uint32_t flags;
     RETURN_IF_ERROR(TRITONBACKEND_RequestFlags(request, &flags));
 
+    TRITONSERVER_InferenceTrace* triton_trace;
+    RETURN_IF_ERROR(TRITONBACKEND_RequestTrace(request, &triton_trace));
+
+    InferenceTrace trace = {triton_trace};
+
     std::unique_ptr<InferRequest> infer_request;
     if (model_state->IsDecoupled()) {
       TRITONBACKEND_ResponseFactory* factory_ptr;
@@ -372,13 +377,15 @@ ModelInstanceState::SaveRequestsToSharedMemory(
           id, correlation_id, pb_input_tensors, requested_output_names,
           model_state->Name(), model_state->Version(), parameters_string, flags,
           0 /* BLS request timeout*/, reinterpret_cast<intptr_t>(factory_ptr),
-          reinterpret_cast<intptr_t>(request));
+          reinterpret_cast<intptr_t>(request),
+          PreferredMemory(PreferredMemory::DEFAULT, 0), trace);
     } else {
       infer_request = std::make_unique<InferRequest>(
           id, correlation_id, pb_input_tensors, requested_output_names,
           model_state->Name(), model_state->Version(), parameters_string, flags,
           0 /* BLS request timeout*/, 0 /* response_factory_address */,
-          reinterpret_cast<intptr_t>(request));
+          reinterpret_cast<intptr_t>(request),
+          PreferredMemory(PreferredMemory::DEFAULT, 0), trace);
     }
 
     RETURN_IF_EXCEPTION(infer_request->SaveToSharedMemory(Stub()->ShmPool()));
diff --git a/src/request_executor.cc b/src/request_executor.cc
index 2590ee37..b54e3988 100644
--- a/src/request_executor.cc
+++ b/src/request_executor.cc
@@ -359,6 +359,12 @@ RequestExecutor::Infer(
     THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetReleaseCallback(
         irequest, InferRequestComplete, nullptr /* request_release_userp */));
 
+    TRITONSERVER_InferenceTrace* trace = nullptr;
+    if (infer_request->Trace().triton_trace_ != nullptr) {
+      THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceTraceSpawnChildTrace(
+          infer_request->Trace().triton_trace_, &trace));
+    }
+
     for (auto& infer_input : infer_request->Inputs()) {
       THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestAddInput(
           irequest, infer_input->Name().c_str(),
@@ -388,8 +394,8 @@
           reinterpret_cast<void*>(infer_payload->ResponseAllocUserp().get()),
           InferResponseComplete, reinterpret_cast<void*>(infer_payload.get())));
 
-      THROW_IF_TRITON_ERROR(TRITONSERVER_ServerInferAsync(
-          server_, irequest, nullptr /* trace */));
+      THROW_IF_TRITON_ERROR(
+          TRITONSERVER_ServerInferAsync(server_, irequest, trace));
     }
   }
   catch (const PythonBackendException& pb_exception) {
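To illustrate the user-visible effect of this diff, here is a minimal `model.py` sketch (not part of the patch) showing how a BLS request could forward the parent request's trace through the new `trace()` accessor and `trace` keyword argument; the model name "bls_model" and the tensor names are placeholders, while `pb_utils.InferenceRequest` and `exec()` are the existing BLS API.

```python
# Minimal sketch of a Python backend model forwarding its trace to a BLS call.
# "bls_model", "INPUT0", and "OUTPUT0" are hypothetical names.
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        responses = []
        for request in requests:
            input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")

            # Pass the parent request's trace (exposed by the new
            # request.trace() binding) so the nested BLS request is
            # associated with the original request's trace.
            bls_request = pb_utils.InferenceRequest(
                model_name="bls_model",
                inputs=[input0],
                requested_output_names=["OUTPUT0"],
                trace=request.trace())

            bls_response = bls_request.exec()
            if bls_response.has_error():
                raise pb_utils.TritonModelException(
                    bls_response.error().message())

            output0 = pb_utils.get_output_tensor_by_name(
                bls_response, "OUTPUT0")
            responses.append(
                pb_utils.InferenceResponse(output_tensors=[output0]))
        return responses
```

Because `RequestExecutor::Infer()` spawns a child trace via `TRITONSERVER_InferenceTraceSpawnChildTrace` before calling `TRITONSERVER_ServerInferAsync`, a BLS request issued this way appears nested under the originating request in Triton's trace output instead of being untraced.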