Add request input to profile export data (#549)

nv-hwoo · web-flow · commit 7db166fc31e6 · 2024-03-27T19:02:02.000-07:00
* Add request_inputs field in profile export file

* Add request input data to profile export

* Add test cases

* Remove comment

* Add ticket and remove comments

* Add tests

* Remove first 4 bytes

* Add warning

* handle empty input

* Avoid access to buffer when empty
diff --git a/src/c++/library/common.cc b/src/c++/library/common.cc
@@ -182,6 +182,23 @@ InferInput::AppendFromString(const std::vector<std::string>& input)
   return AppendRaw(reinterpret_cast<const uint8_t*>(&sbuf[0]), sbuf.size());
 }
 
+Error
+InferInput::RawData(const uint8_t** buf, size_t* byte_size)
+{
+  // Nothing has been appended to this input: report an empty buffer.
+  if (bufs_.empty()) {
+    *buf = nullptr;
+    *byte_size = 0;
+    return Error::Success;
+  }
+
+  // TMA-1775 - handle multi-batch case
+  // Only the first appended buffer is exposed for now.
+  *buf = bufs_[0];
+  *byte_size = buf_byte_sizes_[0];
+  return Error::Success;
+}
+
 Error
 InferInput::ByteSize(size_t* byte_size) const
 {
diff --git a/src/c++/library/common.h b/src/c++/library/common.h
@@ -334,6 +334,15 @@ class InferInput {
   /// \return Error object indicating success or failure.
   Error AppendFromString(const std::vector<std::string>& input);
 
+  /// Get access to the buffer holding raw input. Note the buffer is owned by
+  /// InferInput instance. Users can copy out the data if required to extend
+  /// the lifetime.
+  /// \param buf Returns the pointer to the start of the buffer.
+  /// \param byte_size Returns the size of buffer in bytes.
+  /// \return Error object indicating success or failure of the
+  /// request.
+  Error RawData(const uint8_t** buf, size_t* byte_size);
+
   /// Gets the size of data added into this input in bytes.
   /// \param byte_size The size of data added in bytes.
   /// \return Error object indicating success or failure.
diff --git a/src/c++/perf_analyzer/client_backend/client_backend.cc b/src/c++/perf_analyzer/client_backend/client_backend.cc
@@ -505,6 +505,16 @@ InferInput::SetSharedMemory(
       pa::GENERIC_ERROR);
 }
 
+Error
+InferInput::RawData(const uint8_t** buf, size_t* byte_size)
+{
+  // Default implementation for backends that do not override RawData(): the
+  // out-parameters are left untouched and an error naming the backend is
+  // returned.
+  const std::string message{
+      "client backend of kind " + BackendKindToString(kind_) +
+      " does not support RawData() for InferInput"};
+  return Error(message, pa::GENERIC_ERROR);
+}
 
 InferInput::InferInput(
     const BackendKind kind, const std::string& name,
diff --git a/src/c++/perf_analyzer/client_backend/client_backend.h b/src/c++/perf_analyzer/client_backend/client_backend.h
@@ -558,6 +558,15 @@ class InferInput {
   virtual Error SetSharedMemory(
       const std::string& name, size_t byte_size, size_t offset = 0);
 
+  /// Get access to the buffer holding raw input. Note the buffer is owned by
+  /// InferInput instance. Users can copy out the data if required to extend
+  /// the lifetime.
+  /// \param buf Returns the pointer to the start of the buffer.
+  /// \param byte_size Returns the size of buffer in bytes.
+  /// \return Error object indicating success or failure of the
+  /// request.
+  virtual Error RawData(const uint8_t** buf, size_t* byte_size);
+
  protected:
   InferInput(
       const BackendKind kind, const std::string& name,
diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.cc b/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.cc
@@ -71,6 +71,23 @@ OpenAiInferInput::AppendRaw(const uint8_t* input, size_t input_byte_size)
   return Error::Success;
 }
 
+Error
+OpenAiInferInput::RawData(const uint8_t** buf, size_t* byte_size)
+{
+  // Indexing an empty container is undefined behavior, so report a null
+  // buffer of zero length when no data has been appended yet.
+  if (bufs_.empty()) {
+    *buf = nullptr;
+    *byte_size = 0;
+    return Error::Success;
+  }
+
+  // TMA-1775 - handle multi-batch case
+  *buf = bufs_[0];
+  *byte_size = buf_byte_sizes_[0];
+  return Error::Success;
+}
+
 Error
 OpenAiInferInput::PrepareForRequest()
 {
diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h b/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h
@@ -51,6 +51,8 @@ class OpenAiInferInput : public InferInput {
   Error Reset() override;
   /// See InferInput::AppendRaw()
   Error AppendRaw(const uint8_t* input, size_t input_byte_size) override;
+  /// See InferInput::RawData()
+  Error RawData(const uint8_t** buf, size_t* byte_size) override;
   /// Prepare the input to be in the form expected by an OpenAI client,
   /// must call before accessing the data.
   Error PrepareForRequest();
diff --git a/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.cc b/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.cc
@@ -756,6 +756,16 @@ TritonInferInput::SetSharedMemory(
   return Error::Success;
 }
 
+// Forwards the query to the wrapped `input_` object.
+// NOTE(review): RETURN_IF_TRITON_ERROR is defined elsewhere; presumably it
+// returns early with a converted error when the wrapped call fails - confirm.
+Error
+TritonInferInput::RawData(const uint8_t** buf, size_t* byte_size)
+{
+  RETURN_IF_TRITON_ERROR(input_->RawData(buf, byte_size));
+  return Error::Success;
+}
+
 TritonInferInput::TritonInferInput(
     const std::string& name, const std::string& datatype)
     : InferInput(BackendKind::TRITON, name, datatype)
diff --git a/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.h b/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.h
@@ -283,6 +283,8 @@ class TritonInferInput : public InferInput {
   /// See InferInput::SetSharedMemory()
   Error SetSharedMemory(
       const std::string& name, size_t byte_size, size_t offset = 0) override;
+  /// See InferInput::RawData()
+  Error RawData(const uint8_t** buf, size_t* byte_size) override;
 
  private:
   explicit TritonInferInput(
diff --git a/src/c++/perf_analyzer/infer_context.cc b/src/c++/perf_analyzer/infer_context.cc
@@ -112,6 +112,10 @@ InferContext::SendRequest(
   }
 
   thread_stat_->num_sent_requests_++;
+
+  // Parse the request inputs to save in the profile export file
+  RequestRecord::RequestInput request_inputs{GetInputs()};
+
   if (async_) {
     uint64_t unique_request_id{(thread_id_ << 48) | ((request_id << 16) >> 16)};
     infer_data_.options_->request_id_ = std::to_string(unique_request_id);
@@ -120,6 +124,7 @@ InferContext::SendRequest(
       auto it = async_req_map_
                     .emplace(infer_data_.options_->request_id_, RequestRecord())
                     .first;
+      it->second.request_inputs_ = {request_inputs};
       it->second.start_time_ = std::chrono::system_clock::now();
       it->second.sequence_end_ = infer_data_.options_->sequence_end_;
       it->second.delayed_ = delayed;
@@ -149,10 +154,10 @@ InferContext::SendRequest(
         &results, *(infer_data_.options_), infer_data_.valid_inputs_,
         infer_data_.outputs_);
     thread_stat_->idle_timer.Stop();
-    RequestRecord::ResponseOutput response_output{};
+    RequestRecord::ResponseOutput response_outputs{};
     if (results != nullptr) {
       if (thread_stat_->status_.IsOk()) {
-        response_output = GetOutput(*results);
+        response_outputs = GetOutputs(*results);
         thread_stat_->status_ = ValidateOutputs(results);
       }
       delete results;
@@ -169,8 +174,9 @@ InferContext::SendRequest(
       std::lock_guard<std::mutex> lock(thread_stat_->mu_);
       auto total = end_time_sync - start_time_sync;
       thread_stat_->request_records_.emplace_back(RequestRecord(
-          start_time_sync, std::move(end_time_syncs), {response_output},
-          infer_data_.options_->sequence_end_, delayed, sequence_id, false));
+          start_time_sync, std::move(end_time_syncs), {request_inputs},
+          {response_outputs}, infer_data_.options_->sequence_end_, delayed,
+          sequence_id, false));
       thread_stat_->status_ =
           infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
       if (!thread_stat_->status_.IsOk()) {
@@ -180,15 +186,47 @@ InferContext::SendRequest(
   }
 }
 
+// Snapshot the raw data of every valid input of the current request so it
+// can be saved in the profile export file.
+// \return Map from input name to a copy of its data and its data type.
+const RequestRecord::RequestInput
+InferContext::GetInputs()
+{
+  RequestRecord::RequestInput input{};
+  for (const auto& request_input : infer_data_.valid_inputs_) {
+    const uint8_t* buf{nullptr};
+    size_t byte_size{0};
+    const std::string data_type{request_input->Datatype()};
+
+    // RawData() reports failure for client backends that cannot expose their
+    // raw input buffer. In that case record the input with an empty payload
+    // instead of trusting whatever the failed call left in the out-parameters.
+    auto status = request_input->RawData(&buf, &byte_size);
+    if (!status.IsOk()) {
+      buf = nullptr;
+      byte_size = 0;
+    }
+
+    // The first 4 bytes of BYTES data is a 32-bit integer to indicate the size
+    // of the rest of the data (which we already know based on byte_size). It
+    // should be ignored here, as it isn't part of the actual request
+    if (data_type == "BYTES" && byte_size >= 4) {
+      buf += 4;
+      byte_size -= 4;
+    }
+    input.emplace(request_input->Name(), RecordData(buf, byte_size, data_type));
+  }
+  return input;
+}
+
 const RequestRecord::ResponseOutput
-InferContext::GetOutput(const cb::InferResult& infer_result)
+InferContext::GetOutputs(const cb::InferResult& infer_result)
 {
   RequestRecord::ResponseOutput output{};
   for (const auto& requested_output : infer_data_.outputs_) {
     const uint8_t* buf{nullptr};
     size_t byte_size{0};
     infer_result.RawData(requested_output->Name(), &buf, &byte_size);
-
     // The first 4 bytes of BYTES data is a 32-bit integer to indicate the size
     // of the rest of the data (which we already know based on byte_size). It
     // should be ignored here, as it isn't part of the actual response
@@ -282,7 +309,7 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
         }
         it->second.response_timestamps_.push_back(
             std::chrono::system_clock::now());
-        it->second.response_outputs_.push_back(GetOutput(*result));
+        it->second.response_outputs_.push_back(GetOutputs(*result));
         num_responses_++;
         if (is_null_response == true) {
           it->second.has_null_last_response_ = true;
@@ -296,9 +323,9 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
           has_received_final_response_ = is_final_response;
           thread_stat_->request_records_.emplace_back(
               it->second.start_time_, it->second.response_timestamps_,
-              it->second.response_outputs_, it->second.sequence_end_,
-              it->second.delayed_, it->second.sequence_id_,
-              it->second.has_null_last_response_);
+              it->second.request_inputs_, it->second.response_outputs_,
+              it->second.sequence_end_, it->second.delayed_,
+              it->second.sequence_id_, it->second.has_null_last_response_);
           infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
           thread_stat_->cb_status_ = ValidateOutputs(result);
           async_req_map_.erase(request_id);
diff --git a/src/c++/perf_analyzer/infer_context.h b/src/c++/perf_analyzer/infer_context.h
@@ -185,7 +185,9 @@ class InferContext {
   std::function<void(uint32_t)> async_callback_finalize_func_ = nullptr;
 
  private:
-  const RequestRecord::ResponseOutput GetOutput(
+  const RequestRecord::RequestInput GetInputs();
+
+  const RequestRecord::ResponseOutput GetOutputs(
       const cb::InferResult& infer_result);
 
   const uint32_t id_{0};
diff --git a/src/c++/perf_analyzer/profile_data_exporter.cc b/src/c++/perf_analyzer/profile_data_exporter.cc
@@ -122,6 +122,11 @@ ProfileDataExporter::AddRequests(
       request.AddMember("sequence_id", sequence_id, document_.GetAllocator());
     }
 
+    rapidjson::Value request_inputs(rapidjson::kObjectType);
+    AddRequestInputs(request_inputs, raw_request.request_inputs_);
+    request.AddMember(
+        "request_inputs", request_inputs, document_.GetAllocator());
+
     rapidjson::Value response_timestamps(rapidjson::kArrayType);
     AddResponseTimestamps(
         response_timestamps, raw_request.response_timestamps_);
@@ -151,6 +156,52 @@ ProfileDataExporter::AddResponseTimestamps(
   }
 }
 
+// Serialize the recorded request inputs into `request_inputs_json`, adding one
+// member per input keyed by the input name.
+void
+ProfileDataExporter::AddRequestInputs(
+    rapidjson::Value& request_inputs_json,
+    const std::vector<RequestRecord::RequestInput>& request_inputs)
+{
+  for (const auto& request_input : request_inputs) {
+    for (const auto& input : request_input) {
+      const auto& name{input.first};
+      const uint8_t* buf{input.second.data_.get()};
+      const size_t byte_size{input.second.size_};
+      const auto& data_type{input.second.data_type_};
+      rapidjson::Value name_json(name.c_str(), document_.GetAllocator());
+      rapidjson::Value input_json{};
+      // TMA-1777: support other data types
+      if (buf == nullptr || byte_size == 0) {
+        // No payload was recorded: export an empty string and never
+        // dereference the buffer.
+        input_json.SetString("", 0, document_.GetAllocator());
+      } else if (data_type == "BYTES" || data_type == "JSON") {
+        input_json.SetString(
+            reinterpret_cast<const char*>(buf), byte_size,
+            document_.GetAllocator());
+      } else if (data_type == "INT32") {
+        // Only the first element is exported. Copy it out rather than casting
+        // the byte pointer, and skip payloads too short to hold one value
+        // (input_json then keeps its default-constructed value).
+        if (byte_size >= sizeof(int32_t)) {
+          int32_t val{0};
+          std::memcpy(&val, buf, sizeof(val));
+          input_json.SetInt(val);
+        }
+      } else if (data_type == "BOOL") {
+        // byte_size > 0 is guaranteed here, so reading one byte is safe.
+        input_json.SetBool(*buf > 0);
+      } else {
+        std::cerr << "WARNING: data type '" + data_type +
+                         "' is not supported with JSON."
+                  << std::endl;
+      }
+      request_inputs_json.AddMember(
+          name_json, input_json, document_.GetAllocator());
+    }
+  }
+}
+
 void
 ProfileDataExporter::AddResponseOutputs(
     rapidjson::Value& outputs_json,
@@ -164,6 +208,7 @@ ProfileDataExporter::AddResponseOutputs(
       const auto& byte_size{output.second.size_};
       rapidjson::Value name_json(name.c_str(), document_.GetAllocator());
       rapidjson::Value output_json{};
+      // TMA-1777: support other data types
       if (buf != nullptr) {
         output_json.SetString(
             reinterpret_cast<const char*>(buf), byte_size,
diff --git a/src/c++/perf_analyzer/profile_data_exporter.h b/src/c++/perf_analyzer/profile_data_exporter.h
@@ -69,6 +69,9 @@ class ProfileDataExporter {
   void AddRequests(
       rapidjson::Value& entry, rapidjson::Value& requests,
       const Experiment& raw_experiment);
+  void AddRequestInputs(
+      rapidjson::Value& inputs_json,
+      const std::vector<RequestRecord::RequestInput>& inputs);
   void AddResponseTimestamps(
       rapidjson::Value& timestamps_json,
       const std::vector<std::chrono::time_point<std::chrono::system_clock>>&
diff --git a/src/c++/perf_analyzer/request_record.h b/src/c++/perf_analyzer/request_record.h
@@ -33,18 +33,19 @@
 
 namespace triton { namespace perfanalyzer {
 
-/// A record containing the data of a single response
-struct ResponseData {
-  ResponseData(const uint8_t* buf, size_t size)
+/// A record containing the data of a single request input or response output
+struct RecordData {
+  RecordData(const uint8_t* buf, size_t size, std::string data_type = "")
   {
     uint8_t* array = new uint8_t[size];
     std::memcpy(array, buf, size);
     data_ = std::shared_ptr<uint8_t>(array, [](uint8_t* p) { delete[] p; });
     size_ = size;
+    data_type_ = data_type;
   }
 
   // Define equality comparison operator so it can be inserted into maps
-  bool operator==(const ResponseData& other) const
+  bool operator==(const RecordData& other) const
   {
     if (size_ != other.size_)
       return false;
@@ -54,24 +55,28 @@ struct ResponseData {
 
   std::shared_ptr<uint8_t> data_;
   size_t size_;
+  std::string data_type_;
 };
 
 
 /// A record of an individual request
 struct RequestRecord {
-  using ResponseOutput = std::unordered_map<std::string, ResponseData>;
+  using RequestInput = std::unordered_map<std::string, RecordData>;
+  using ResponseOutput = std::unordered_map<std::string, RecordData>;
 
   RequestRecord(
       std::chrono::time_point<std::chrono::system_clock> start_time =
           std::chrono::time_point<std::chrono::system_clock>(),
       std::vector<std::chrono::time_point<std::chrono::system_clock>>
           response_timestamps = {},
+      std::vector<RequestInput> request_inputs = {},
       std::vector<ResponseOutput> response_outputs = {},
       bool sequence_end = true, bool delayed = false, uint64_t sequence_id = 0,
       bool has_null_last_response = false)
       : start_time_(start_time), response_timestamps_(response_timestamps),
-        response_outputs_(response_outputs), sequence_end_(sequence_end),
-        delayed_(delayed), sequence_id_(sequence_id),
+        request_inputs_(request_inputs), response_outputs_(response_outputs),
+        sequence_end_(sequence_end), delayed_(delayed),
+        sequence_id_(sequence_id),
         has_null_last_response_(has_null_last_response)
   {
   }
@@ -81,7 +86,7 @@ struct RequestRecord {
   std::vector<std::chrono::time_point<std::chrono::system_clock>>
       response_timestamps_;
 
-  // Collection of response outputs
+  std::vector<RequestInput> request_inputs_;
   std::vector<ResponseOutput> response_outputs_;
   // Whether or not the request is at the end of a sequence.
   bool sequence_end_;
diff --git a/src/c++/perf_analyzer/test_inference_profiler.cc b/src/c++/perf_analyzer/test_inference_profiler.cc
diff --git a/src/c++/perf_analyzer/test_load_manager.cc b/src/c++/perf_analyzer/test_load_manager.cc
diff --git a/src/c++/perf_analyzer/test_profile_data_collector.cc b/src/c++/perf_analyzer/test_profile_data_collector.cc
diff --git a/src/c++/perf_analyzer/test_profile_data_exporter.cc b/src/c++/perf_analyzer/test_profile_data_exporter.cc