Properly remove first 4 bytes of BYTES data #550

Merged 7 commits on Mar 27, 2024 (diff below shows changes from 5 commits)
10 changes: 6 additions & 4 deletions src/c++/library/common.cc
@@ -279,9 +279,9 @@ InferInput::GetNext(
 Error
 InferRequestedOutput::Create(
     InferRequestedOutput** infer_output, const std::string& name,
-    const size_t class_count)
+    const size_t class_count, const std::string& datatype)
 {
-  *infer_output = new InferRequestedOutput(name, class_count);
+  *infer_output = new InferRequestedOutput(name, datatype, class_count);
   return Error::Success;
 }

@@ -309,8 +309,10 @@ InferRequestedOutput::UnsetSharedMemory()
 }

 InferRequestedOutput::InferRequestedOutput(
-    const std::string& name, const size_t class_count)
-    : name_(name), class_count_(class_count), io_type_(NONE)
+    const std::string& name, const std::string& datatype,
+    const size_t class_count)
+    : name_(name), datatype_(datatype), class_count_(class_count),
+      io_type_(NONE)
 {
 }

6 changes: 4 additions & 2 deletions src/c++/library/common.h
@@ -400,7 +400,7 @@ class InferRequestedOutput {
   /// \return Error object indicating success or failure.
   static Error Create(
       InferRequestedOutput** infer_output, const std::string& name,
-      const size_t class_count = 0);
+      const size_t class_count = 0, const std::string& datatype = "");

   /// Gets name of the associated output tensor.
   /// \return The name of the tensor.
@@ -455,9 +455,11 @@ class InferRequestedOutput {
 #endif

   explicit InferRequestedOutput(
-      const std::string& name, const size_t class_count = 0);
+      const std::string& name, const std::string& datatype,
+      const size_t class_count = 0);

   std::string name_;
+  std::string datatype_;
   size_t class_count_;

   // Used only if working with Shared Memory
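Note on the API shape: datatype was appended after class_count, and both keep defaults, so existing call sites compile unchanged; a caller that wants to pass a datatype therefore has to spell out class_count too. A minimal sketch of the updated call (the wrapper function, the "text_output" name, and the tc alias are illustrative, not from this PR):

#include "common.h"

namespace tc = triton::client;

tc::Error
CreateBytesOutput(tc::InferRequestedOutput** output)
{
  // class_count = 0 (no classification results requested); "BYTES" is the
  // new trailing argument.
  return tc::InferRequestedOutput::Create(
      output, "text_output", 0 /* class_count */, "BYTES");
}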
16 changes: 9 additions & 7 deletions src/c++/perf_analyzer/client_backend/client_backend.cc
@@ -519,16 +519,17 @@ InferInput::InferInput(
 Error
 InferRequestedOutput::Create(
     InferRequestedOutput** infer_output, const BackendKind kind,
-    const std::string& name, const size_t class_count)
+    const std::string& name, const std::string& datatype,
+    const size_t class_count)
 {
   if (kind == TRITON) {
     RETURN_IF_CB_ERROR(tritonremote::TritonInferRequestedOutput::Create(
-        infer_output, name, class_count));
+        infer_output, name, class_count, datatype));
   }
 #ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI
   else if (kind == OPENAI) {
-    RETURN_IF_CB_ERROR(
-        openai::OpenAiInferRequestedOutput::Create(infer_output, name));
+    RETURN_IF_CB_ERROR(openai::OpenAiInferRequestedOutput::Create(
+        infer_output, name, datatype));
   }
 #endif  // TRITON_ENABLE_PERF_ANALYZER_OPENAI
 #ifdef TRITON_ENABLE_PERF_ANALYZER_TFS
@@ -540,7 +541,7 @@ InferRequestedOutput::Create(
 #ifdef TRITON_ENABLE_PERF_ANALYZER_C_API
   else if (kind == TRITON_C_API) {
     RETURN_IF_CB_ERROR(tritoncapi::TritonCApiInferRequestedOutput::Create(
-        infer_output, name, class_count));
+        infer_output, name, class_count, datatype));
   }
 #endif  // TRITON_ENABLE_PERF_ANALYZER_C_API
   else {
@@ -564,8 +565,9 @@ InferRequestedOutput::SetSharedMemory(
 }

 InferRequestedOutput::InferRequestedOutput(
-    const BackendKind kind, const std::string& name)
-    : kind_(kind), name_(name)
+    const BackendKind kind, const std::string& name,
+    const std::string& datatype)
+    : kind_(kind), name_(name), datatype_(datatype)
 {
 }

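One level up, the dispatching factory takes datatype before class_count — the opposite order from the tc:: signature above. A hedged sketch of a perf-analyzer-side call; the include path, cb alias, and wrapper function are assumptions for illustration:

#include "client_backend/client_backend.h"

namespace cb = triton::perfanalyzer::clientbackend;

cb::Error
CreateTritonBytesOutput(cb::InferRequestedOutput** output)
{
  // datatype is positional here; class_count keeps its default of 0.
  return cb::InferRequestedOutput::Create(
      output, cb::BackendKind::TRITON, "text_output", "BYTES");
}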
13 changes: 11 additions & 2 deletions src/c++/perf_analyzer/client_backend/client_backend.h
@@ -581,18 +581,24 @@ class InferRequestedOutput {
   /// \param infer_output Returns a new InferOutputGrpc object.
   /// \param kind The kind of the associated client backend.
   /// \param name The name of output being requested.
+  /// \param datatype The datatype of the output.
   /// \param class_count The number of classifications to be requested. The
   /// default value is 0 which means the classification results are not
   /// requested.
   /// \return Error object indicating success or failure.
   static Error Create(
       InferRequestedOutput** infer_output, const BackendKind kind,
-      const std::string& name, const size_t class_count = 0);
+      const std::string& name, const std::string& datatype,
+      const size_t class_count = 0);

   /// Gets name of the associated output tensor.
   /// \return The name of the tensor.
   const std::string& Name() const { return name_; }

+  /// Gets datatype of the associated output tensor.
+  /// \return The datatype of the tensor.
+  const std::string& Datatype() const { return datatype_; }
+
   /// Set the output tensor data to be written to specified shared
   /// memory region.
   /// \param region_name The name of the shared memory region.
@@ -605,9 +611,12 @@
       const size_t offset = 0);

  protected:
-  InferRequestedOutput(const BackendKind kind, const std::string& name);
+  InferRequestedOutput(
+      const BackendKind kind, const std::string& name,
+      const std::string& datatype);
   const BackendKind kind_;
   const std::string name_;
+  const std::string datatype_;
 };

 //
src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc
@@ -83,23 +83,25 @@ OpenAiClientBackend::ClientInferStat(InferStat* infer_stat)

 Error
 OpenAiInferRequestedOutput::Create(
-    InferRequestedOutput** infer_output, const std::string& name)
+    InferRequestedOutput** infer_output, const std::string& name,
+    const std::string& datatype)
 {
   OpenAiInferRequestedOutput* local_infer_output =
-      new OpenAiInferRequestedOutput(name);
+      new OpenAiInferRequestedOutput(name, datatype);

   tc::InferRequestedOutput* openai_infer_output;
-  RETURN_IF_TRITON_ERROR(
-      tc::InferRequestedOutput::Create(&openai_infer_output, name));
+  RETURN_IF_TRITON_ERROR(tc::InferRequestedOutput::Create(
+      &openai_infer_output, name, 0, datatype));
   local_infer_output->output_.reset(openai_infer_output);

   *infer_output = local_infer_output;

   return Error::Success;
 }

-OpenAiInferRequestedOutput::OpenAiInferRequestedOutput(const std::string& name)
-    : InferRequestedOutput(BackendKind::OPENAI, name)
+OpenAiInferRequestedOutput::OpenAiInferRequestedOutput(
+    const std::string& name, const std::string& datatype)
+    : InferRequestedOutput(BackendKind::OPENAI, name, datatype)
 {
 }

src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h
@@ -95,13 +95,15 @@ class OpenAiClientBackend : public ClientBackend {
 class OpenAiInferRequestedOutput : public InferRequestedOutput {
  public:
   static Error Create(
-      InferRequestedOutput** infer_output, const std::string& name);
+      InferRequestedOutput** infer_output, const std::string& name,
+      const std::string& datatype);
   /// Returns the raw InferRequestedOutput object required by OpenAi client
   /// library.
   tc::InferRequestedOutput* Get() const { return output_.get(); }

  private:
-  explicit OpenAiInferRequestedOutput(const std::string& name);
+  explicit OpenAiInferRequestedOutput(
+      const std::string& name, const std::string& datatype);

   std::unique_ptr<tc::InferRequestedOutput> output_;
 };
src/c++/perf_analyzer/client_backend/triton/triton_client_backend.cc
@@ -768,14 +768,14 @@ TritonInferInput::TritonInferInput(
 Error
 TritonInferRequestedOutput::Create(
     InferRequestedOutput** infer_output, const std::string& name,
-    const size_t class_count)
+    const size_t class_count, const std::string& datatype)
 {
   TritonInferRequestedOutput* local_infer_output =
-      new TritonInferRequestedOutput(name);
+      new TritonInferRequestedOutput(name, datatype);

   tc::InferRequestedOutput* triton_infer_output;
   RETURN_IF_TRITON_ERROR(tc::InferRequestedOutput::Create(
-      &triton_infer_output, name, class_count));
+      &triton_infer_output, name, class_count, datatype));
   local_infer_output->output_.reset(triton_infer_output);

   *infer_output = local_infer_output;
@@ -793,8 +793,9 @@ TritonInferRequestedOutput::SetSharedMemory(
 }


-TritonInferRequestedOutput::TritonInferRequestedOutput(const std::string& name)
-    : InferRequestedOutput(BackendKind::TRITON, name)
+TritonInferRequestedOutput::TritonInferRequestedOutput(
+    const std::string& name, const std::string& datatype)
+    : InferRequestedOutput(BackendKind::TRITON, name, datatype)
 {
 }

src/c++/perf_analyzer/client_backend/triton/triton_client_backend.h
@@ -299,7 +299,7 @@ class TritonInferRequestedOutput : public InferRequestedOutput {
  public:
   static Error Create(
       InferRequestedOutput** infer_output, const std::string& name,
-      const size_t class_count = 0);
+      const size_t class_count = 0, const std::string& datatype = "");
   /// Returns the raw InferRequestedOutput object required by triton client
   /// library.
   tc::InferRequestedOutput* Get() const { return output_.get(); }
@@ -309,7 +309,8 @@ class TritonInferRequestedOutput : public InferRequestedOutput {
       const size_t offset = 0) override;

  private:
-  explicit TritonInferRequestedOutput(const std::string& name);
+  explicit TritonInferRequestedOutput(
+      const std::string& name, const std::string& datatype);

   std::unique_ptr<tc::InferRequestedOutput> output_;
 };
src/c++/perf_analyzer/client_backend/triton_c_api/triton_c_api_backend.cc
@@ -335,14 +335,14 @@ TritonCApiInferInput::TritonCApiInferInput(
 Error
 TritonCApiInferRequestedOutput::Create(
     InferRequestedOutput** infer_output, const std::string& name,
-    const size_t class_count)
+    const size_t class_count, const std::string& datatype)
 {
   TritonCApiInferRequestedOutput* local_infer_output =
       new TritonCApiInferRequestedOutput(name);

   tc::InferRequestedOutput* triton_infer_output;
   RETURN_IF_TRITON_ERROR(tc::InferRequestedOutput::Create(
-      &triton_infer_output, name, class_count));
+      &triton_infer_output, name, class_count, datatype));
   local_infer_output->output_.reset(triton_infer_output);

   *infer_output = local_infer_output;
19 changes: 1 addition & 18 deletions src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py
@@ -460,28 +460,11 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
             num_generated_tokens,
         )

-    def _remove_leading_invalid_chars(self, text: str):
-        if len(text) < 4:
-            return text
-
-        for i, char in enumerate(text):
-            # There will be 3 or 4 chars
-            # (but sometimes the first char looks valid, so don't stop until we've seen at least 3)
-            if char.isprintable() and i > 2:
-                break
-
-        return text[i:]
-
     def _preprocess_response(
         self, res_timestamps: list[int], res_outputs: list[dict[str, str]]
     ) -> None:
         """Helper function to preprocess responses of a request."""
-        # FIXME -- remove this triton code once it is properly fixed in PA
-        # (PA/triton will add junk to the start of the BYTES array. Remove it here)
-        if self._service_kind == "triton":
-            for d in res_outputs:
-                d["text_output"] = self._remove_leading_invalid_chars(d["text_output"])
-        elif self._service_kind == "openai":
+        if self._service_kind == "openai":
             # remove the null final response in streaming mode
             last_response = res_outputs[-1]["response"]
             last_response = remove_sse_prefix(last_response)
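For the record, the deleted heuristic guessed at "3 or 4" junk characters because the junk was exactly this PR's 4-byte length prefix: for chunks shorter than 256 bytes the three high bytes of the little-endian length are NUL, but the low byte is itself a printable character whenever the chunk length lands in the printable-ASCII range. A small self-contained sketch of that failure mode (values illustrative; assumes a little-endian host):

#include <cstdint>
#include <cstdio>

int main()
{
  const uint32_t len = 65;  // a 65-byte "text_output" chunk
  const uint8_t* p = reinterpret_cast<const uint8_t*>(&len);
  // Prints "A..." on a little-endian host: the low byte (0x41) looks like a
  // valid character, so only 3 of the 4 prefix bytes look like junk.
  for (int i = 0; i < 4; ++i) {
    std::printf("%c", (p[i] >= 0x20 && p[i] < 0x7f) ? p[i] : '.');
  }
  std::printf("\n");
  return 0;
}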
30 changes: 13 additions & 17 deletions src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py
@@ -385,23 +385,19 @@ def test_llm_metrics_get_base_name(self) -> None:
             {
                 "timestamp": 1,
                 "response_timestamps": [3, 5, 8],
-                # FIXME - remove the whitespace once PA handles it.
-                # LLMProfileDataParser preprocesses the responses
-                # from triton server and removes first few chars.
-                # Add whitespace to avoid valid chars being removed.
                 "response_outputs": [
-                    {"text_output": " dogs"},
-                    {"text_output": " are"},
-                    {"text_output": " cool"},
+                    {"text_output": "dogs"},
+                    {"text_output": "are"},
+                    {"text_output": "cool"},
                 ],
             },
             {
                 "timestamp": 2,
                 "response_timestamps": [4, 7, 11],
                 "response_outputs": [
-                    {"text_output": " I"},
-                    {"text_output": " don't"},
-                    {"text_output": " cook food"},
+                    {"text_output": "I"},
+                    {"text_output": "don't"},
+                    {"text_output": "cook food"},
                 ],
             },
         ],
@@ -416,19 +412,19 @@
                 "timestamp": 5,
                 "response_timestamps": [7, 8, 13, 18],
                 "response_outputs": [
-                    {"text_output": " cats"},
-                    {"text_output": " are"},
-                    {"text_output": " cool"},
-                    {"text_output": " too"},
+                    {"text_output": "cats"},
+                    {"text_output": "are"},
+                    {"text_output": "cool"},
+                    {"text_output": "too"},
                 ],
             },
             {
                 "timestamp": 3,
                 "response_timestamps": [6, 8, 11],
                 "response_outputs": [
-                    {"text_output": " it's"},
-                    {"text_output": " very"},
-                    {"text_output": " simple work"},
+                    {"text_output": "it's"},
+                    {"text_output": "very"},
+                    {"text_output": "simple work"},
                 ],
             },
         ],
8 changes: 8 additions & 0 deletions src/c++/perf_analyzer/infer_context.cc
@@ -188,6 +188,14 @@ InferContext::GetOutput(const cb::InferResult& infer_result)
     const uint8_t* buf{nullptr};
     size_t byte_size{0};
     infer_result.RawData(requested_output->Name(), &buf, &byte_size);
+
+    // The first 4 bytes of BYTES data are a 32-bit integer indicating the
+    // size of the rest of the data (which we already know from byte_size).
+    // They should be ignored here, as they aren't part of the actual response.
+    if (requested_output->Datatype() == "BYTES") {
+      buf += 4;
+      byte_size -= 4;
+    }
     output.emplace(requested_output->Name(), ResponseData(buf, byte_size));
   }
   return output;
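This is the core fix. For reference, the framing it relies on: a BYTES element in the raw result buffer is a 4-byte size followed by the payload, so skipping the prefix yields exactly the bytes the model produced. A minimal sketch of the layout and its inverse (helper names are illustrative, not part of the PR; like the code above, it assumes a single element per buffer):

#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Frame one BYTES element the way it shows up in the raw result buffer:
// a 32-bit size prefix followed by the payload bytes.
std::vector<uint8_t> FrameBytesElement(const std::string& payload)
{
  std::vector<uint8_t> buf(sizeof(uint32_t) + payload.size());
  const uint32_t len = static_cast<uint32_t>(payload.size());
  std::memcpy(buf.data(), &len, sizeof(len));
  std::memcpy(buf.data() + sizeof(len), payload.data(), payload.size());
  return buf;
}

// Mirror of the fix above: drop the 4-byte prefix to recover the payload.
std::string UnframeBytesElement(const uint8_t* buf, size_t byte_size)
{
  return std::string(reinterpret_cast<const char*>(buf) + 4, byte_size - 4);
}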
7 changes: 4 additions & 3 deletions src/c++/perf_analyzer/infer_data_manager.cc
@@ -175,11 +175,12 @@ InferDataManager::InitInferDataInput(

 cb::Error
 InferDataManager::InitInferDataOutput(
-    const std::string& name, InferData& infer_data)
+    const std::string& name, const ModelTensor& model_tensor,
+    InferData& infer_data)
 {
   cb::InferRequestedOutput* requested_output;
-  RETURN_IF_ERROR(
-      cb::InferRequestedOutput::Create(&requested_output, backend_kind_, name));
+  RETURN_IF_ERROR(cb::InferRequestedOutput::Create(
+      &requested_output, backend_kind_, name, model_tensor.datatype_));
   infer_data.outputs_.push_back(requested_output);

   return cb::Error::Success;
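Here model_tensor is the parser's per-output metadata (the same object that arrives as output.second in the InitInferData loop further down); only its datatype string matters to this change. A rough sketch of the relevant shape, with everything beyond datatype_ an assumption:

#include <map>
#include <string>

// Assumed minimal shape of the parser's per-output metadata; the real
// ModelTensor carries more fields (shape, optionality, ...).
struct ModelTensor {
  std::string datatype_;  // e.g. "BYTES", "FP32"
};

// parser_->Outputs() yields pairs shaped like this: name -> tensor.
using OutputMap = std::map<std::string, ModelTensor>;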
3 changes: 2 additions & 1 deletion src/c++/perf_analyzer/infer_data_manager.h
@@ -74,7 +74,8 @@ class InferDataManager : public InferDataManagerBase {
       InferData& infer_data) override;

   cb::Error InitInferDataOutput(
-      const std::string& name, InferData& infer_data) override;
+      const std::string& name, const ModelTensor& model_tensor,
+      InferData& infer_data) override;

   /// Helper function to update the inputs
   /// \param thread_id The ID of the calling thread
3 changes: 2 additions & 1 deletion src/c++/perf_analyzer/infer_data_manager_base.cc
@@ -115,7 +115,8 @@ InferDataManagerBase::InitInferData(InferData& infer_data)
   }

   for (const auto& output : *(parser_->Outputs())) {
-    RETURN_IF_ERROR(InitInferDataOutput(output.first, infer_data));
+    RETURN_IF_ERROR(
+        InitInferDataOutput(output.first, output.second, infer_data));
   }

   return cb::Error::Success;
3 changes: 2 additions & 1 deletion src/c++/perf_analyzer/infer_data_manager_base.h
@@ -138,7 +138,8 @@ class InferDataManagerBase : public IInferDataManager {
       InferData& infer_data) = 0;

   virtual cb::Error InitInferDataOutput(
-      const std::string& name, InferData& infer_data) = 0;
+      const std::string& name, const ModelTensor& model_tensor,
+      InferData& infer_data) = 0;

   void AddInferDataParameters(InferData& infer_data);
