Properly remove first 4 bytes of BYTES data #550

Merged 7 commits on Mar 27, 2024 (diff below shows changes from 5 commits)
10 changes: 6 additions & 4 deletions src/c++/library/common.cc
@@ -279,9 +279,9 @@ InferInput::GetNext(
 Error
 InferRequestedOutput::Create(
     InferRequestedOutput** infer_output, const std::string& name,
-    const size_t class_count)
+    const size_t class_count, const std::string& datatype)
 {
-  *infer_output = new InferRequestedOutput(name, class_count);
+  *infer_output = new InferRequestedOutput(name, datatype, class_count);
   return Error::Success;
 }

@@ -309,8 +309,10 @@ InferRequestedOutput::UnsetSharedMemory()
 }

 InferRequestedOutput::InferRequestedOutput(
-    const std::string& name, const size_t class_count)
-    : name_(name), class_count_(class_count), io_type_(NONE)
+    const std::string& name, const std::string& datatype,
+    const size_t class_count)
+    : name_(name), datatype_(datatype), class_count_(class_count),
+      io_type_(NONE)
 {
 }

6 changes: 4 additions & 2 deletions src/c++/library/common.h
@@ -400,7 +400,7 @@ class InferRequestedOutput {
   /// \return Error object indicating success or failure.
   static Error Create(
       InferRequestedOutput** infer_output, const std::string& name,
-      const size_t class_count = 0);
+      const size_t class_count = 0, const std::string& datatype = "");

   /// Gets name of the associated output tensor.
   /// \return The name of the tensor.
@@ -455,9 +455,11 @@ class InferRequestedOutput {
 #endif

   explicit InferRequestedOutput(
-      const std::string& name, const size_t class_count = 0);
+      const std::string& name, const std::string& datatype,
+      const size_t class_count = 0);

   std::string name_;
+  std::string datatype_;
   size_t class_count_;

   // Used only if working with Shared Memory
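Note on the API shape: datatype was appended after class_count, and both keep defaults, so existing call sites compile unchanged; a caller that wants to pass a datatype therefore has to spell out class_count too. A minimal sketch of the updated call (the wrapper function, the "text_output" name, and the tc alias are illustrative, not from this PR):

#include "common.h"

namespace tc = triton::client;

tc::Error
CreateBytesOutput(tc::InferRequestedOutput** output)
{
  // class_count = 0 (no classification results requested); "BYTES" is the
  // new trailing argument.
  return tc::InferRequestedOutput::Create(
      output, "text_output", 0 /* class_count */, "BYTES");
}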
16 changes: 9 additions & 7 deletions src/c++/perf_analyzer/client_backend/client_backend.cc
@@ -519,16 +519,17 @@ InferInput::InferInput(
 Error
 InferRequestedOutput::Create(
     InferRequestedOutput** infer_output, const BackendKind kind,
-    const std::string& name, const size_t class_count)
+    const std::string& name, const std::string& datatype,
+    const size_t class_count)
 {
   if (kind == TRITON) {
     RETURN_IF_CB_ERROR(tritonremote::TritonInferRequestedOutput::Create(
-        infer_output, name, class_count));
+        infer_output, name, class_count, datatype));
   }
 #ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI
   else if (kind == OPENAI) {
-    RETURN_IF_CB_ERROR(
-        openai::OpenAiInferRequestedOutput::Create(infer_output, name));
+    RETURN_IF_CB_ERROR(openai::OpenAiInferRequestedOutput::Create(
+        infer_output, name, datatype));
   }
 #endif  // TRITON_ENABLE_PERF_ANALYZER_OPENAI
 #ifdef TRITON_ENABLE_PERF_ANALYZER_TFS
@@ -540,7 +541,7 @@ InferRequestedOutput::Create(
 #ifdef TRITON_ENABLE_PERF_ANALYZER_C_API
   else if (kind == TRITON_C_API) {
     RETURN_IF_CB_ERROR(tritoncapi::TritonCApiInferRequestedOutput::Create(
-        infer_output, name, class_count));
+        infer_output, name, class_count, datatype));
   }
 #endif  // TRITON_ENABLE_PERF_ANALYZER_C_API
   else {
@@ -564,8 +565,9 @@ InferRequestedOutput::SetSharedMemory(
 }

 InferRequestedOutput::InferRequestedOutput(
-    const BackendKind kind, const std::string& name)
-    : kind_(kind), name_(name)
+    const BackendKind kind, const std::string& name,
+    const std::string& datatype)
+    : kind_(kind), name_(name), datatype_(datatype)
 {
 }

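One level up, the dispatching factory takes datatype before class_count — the opposite order from the tc:: signature above. A hedged sketch of a perf-analyzer-side call; the include path, cb alias, and wrapper function are assumptions for illustration:

#include "client_backend/client_backend.h"

namespace cb = triton::perfanalyzer::clientbackend;

cb::Error
CreateTritonBytesOutput(cb::InferRequestedOutput** output)
{
  // datatype is positional here; class_count keeps its default of 0.
  return cb::InferRequestedOutput::Create(
      output, cb::BackendKind::TRITON, "text_output", "BYTES");
}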
13 changes: 11 additions & 2 deletions src/c++/perf_analyzer/client_backend/client_backend.h
@@ -581,18 +581,24 @@ class InferRequestedOutput {
   /// \param infer_output Returns a new InferOutputGrpc object.
   /// \param kind The kind of the associated client backend.
   /// \param name The name of output being requested.
+  /// \param datatype The datatype of the output.
   /// \param class_count The number of classifications to be requested. The
   /// default value is 0 which means the classification results are not
   /// requested.
   /// \return Error object indicating success or failure.
   static Error Create(
       InferRequestedOutput** infer_output, const BackendKind kind,
-      const std::string& name, const size_t class_count = 0);
+      const std::string& name, const std::string& datatype,
+      const size_t class_count = 0);

   /// Gets name of the associated output tensor.
   /// \return The name of the tensor.
   const std::string& Name() const { return name_; }

+  /// Gets datatype of the associated output tensor.
+  /// \return The datatype of the tensor.
+  const std::string& Datatype() const { return datatype_; }
+
   /// Set the output tensor data to be written to specified shared
   /// memory region.
   /// \param region_name The name of the shared memory region.
@@ -605,9 +611,12 @@
       const size_t offset = 0);

  protected:
-  InferRequestedOutput(const BackendKind kind, const std::string& name);
+  InferRequestedOutput(
+      const BackendKind kind, const std::string& name,
+      const std::string& datatype);
   const BackendKind kind_;
   const std::string name_;
+  const std::string datatype_;
 };

 //
src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc
@@ -83,23 +83,25 @@ OpenAiClientBackend::ClientInferStat(InferStat* infer_stat)

 Error
 OpenAiInferRequestedOutput::Create(
-    InferRequestedOutput** infer_output, const std::string& name)
+    InferRequestedOutput** infer_output, const std::string& name,
+    const std::string& datatype)
 {
   OpenAiInferRequestedOutput* local_infer_output =
-      new OpenAiInferRequestedOutput(name);
+      new OpenAiInferRequestedOutput(name, datatype);

   tc::InferRequestedOutput* openai_infer_output;
-  RETURN_IF_TRITON_ERROR(
-      tc::InferRequestedOutput::Create(&openai_infer_output, name));
+  RETURN_IF_TRITON_ERROR(tc::InferRequestedOutput::Create(
+      &openai_infer_output, name, 0, datatype));
   local_infer_output->output_.reset(openai_infer_output);

   *infer_output = local_infer_output;

   return Error::Success;
 }

-OpenAiInferRequestedOutput::OpenAiInferRequestedOutput(const std::string& name)
-    : InferRequestedOutput(BackendKind::OPENAI, name)
+OpenAiInferRequestedOutput::OpenAiInferRequestedOutput(
+    const std::string& name, const std::string& datatype)
+    : InferRequestedOutput(BackendKind::OPENAI, name, datatype)
 {
 }

src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h
@@ -95,13 +95,15 @@ class OpenAiClientBackend : public ClientBackend {
 class OpenAiInferRequestedOutput : public InferRequestedOutput {
  public:
   static Error Create(
-      InferRequestedOutput** infer_output, const std::string& name);
+      InferRequestedOutput** infer_output, const std::string& name,
+      const std::string& datatype);
   /// Returns the raw InferRequestedOutput object required by OpenAi client
   /// library.
   tc::InferRequestedOutput* Get() const { return output_.get(); }

  private:
-  explicit OpenAiInferRequestedOutput(const std::string& name);
+  explicit OpenAiInferRequestedOutput(
+      const std::string& name, const std::string& datatype);

   std::unique_ptr<tc::InferRequestedOutput> output_;
 };
src/c++/perf_analyzer/client_backend/triton/triton_client_backend.cc
@@ -768,14 +768,14 @@ TritonInferInput::TritonInferInput(
 Error
 TritonInferRequestedOutput::Create(
     InferRequestedOutput** infer_output, const std::string& name,
-    const size_t class_count)
+    const size_t class_count, const std::string& datatype)
 {
   TritonInferRequestedOutput* local_infer_output =
-      new TritonInferRequestedOutput(name);
+      new TritonInferRequestedOutput(name, datatype);

   tc::InferRequestedOutput* triton_infer_output;
   RETURN_IF_TRITON_ERROR(tc::InferRequestedOutput::Create(
-      &triton_infer_output, name, class_count));
+      &triton_infer_output, name, class_count, datatype));
   local_infer_output->output_.reset(triton_infer_output);

   *infer_output = local_infer_output;
@@ -793,8 +793,9 @@ TritonInferRequestedOutput::SetSharedMemory(
 }


-TritonInferRequestedOutput::TritonInferRequestedOutput(const std::string& name)
-    : InferRequestedOutput(BackendKind::TRITON, name)
+TritonInferRequestedOutput::TritonInferRequestedOutput(
+    const std::string& name, const std::string& datatype)
+    : InferRequestedOutput(BackendKind::TRITON, name, datatype)
 {
 }

src/c++/perf_analyzer/client_backend/triton/triton_client_backend.h
@@ -299,7 +299,7 @@ class TritonInferRequestedOutput : public InferRequestedOutput {
  public:
   static Error Create(
       InferRequestedOutput** infer_output, const std::string& name,
-      const size_t class_count = 0);
+      const size_t class_count = 0, const std::string& datatype = "");
   /// Returns the raw InferRequestedOutput object required by triton client
   /// library.
   tc::InferRequestedOutput* Get() const { return output_.get(); }
@@ -309,7 +309,8 @@ class TritonInferRequestedOutput : public InferRequestedOutput {
       const size_t offset = 0) override;

  private:
-  explicit TritonInferRequestedOutput(const std::string& name);
+  explicit TritonInferRequestedOutput(
+      const std::string& name, const std::string& datatype);

   std::unique_ptr<tc::InferRequestedOutput> output_;
 };
src/c++/perf_analyzer/client_backend/triton_c_api/triton_c_api_backend.cc
@@ -335,14 +335,14 @@ TritonCApiInferInput::TritonCApiInferInput(
 Error
 TritonCApiInferRequestedOutput::Create(
     InferRequestedOutput** infer_output, const std::string& name,
-    const size_t class_count)
+    const size_t class_count, const std::string& datatype)
 {
   TritonCApiInferRequestedOutput* local_infer_output =
       new TritonCApiInferRequestedOutput(name);

   tc::InferRequestedOutput* triton_infer_output;
   RETURN_IF_TRITON_ERROR(tc::InferRequestedOutput::Create(
-      &triton_infer_output, name, class_count));
+      &triton_infer_output, name, class_count, datatype));
   local_infer_output->output_.reset(triton_infer_output);

   *infer_output = local_infer_output;
19 changes: 1 addition & 18 deletions src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py
@@ -460,28 +460,11 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
             num_generated_tokens,
         )

-    def _remove_leading_invalid_chars(self, text: str):
-        if len(text) < 4:
-            return text
-
-        for i, char in enumerate(text):
-            # There will be 3 or 4 chars
-            # (but sometimes the first char looks valid, so don't stop until we've seen at least 3)
-            if char.isprintable() and i > 2:
-                break
-
-        return text[i:]
-
     def _preprocess_response(
         self, res_timestamps: list[int], res_outputs: list[dict[str, str]]
     ) -> None:
         """Helper function to preprocess responses of a request."""
-        # FIXME -- remove this triton code once it is properly fixed in PA
-        # (PA/triton will add junk to the start of the BYTES array. Remove it here)
-        if self._service_kind == "triton":
-            for d in res_outputs:
-                d["text_output"] = self._remove_leading_invalid_chars(d["text_output"])
-        elif self._service_kind == "openai":
+        if self._service_kind == "openai":
             # remove the null final response in streaming mode
             last_response = res_outputs[-1]["response"]
             last_response = remove_sse_prefix(last_response)
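For the record, the deleted heuristic guessed at "3 or 4" junk characters because the junk was exactly this PR's 4-byte length prefix: for chunks shorter than 256 bytes the three high bytes of the little-endian length are NUL, but the low byte is itself a printable character whenever the chunk length lands in the printable-ASCII range. A small self-contained sketch of that failure mode (values illustrative; assumes a little-endian host):

#include <cstdint>
#include <cstdio>

int main()
{
  const uint32_t len = 65;  // a 65-byte "text_output" chunk
  const uint8_t* p = reinterpret_cast<const uint8_t*>(&len);
  // Prints "A..." on a little-endian host: the low byte (0x41) looks like a
  // valid character, so only 3 of the 4 prefix bytes look like junk.
  for (int i = 0; i < 4; ++i) {
    std::printf("%c", (p[i] >= 0x20 && p[i] < 0x7f) ? p[i] : '.');
  }
  std::printf("\n");
  return 0;
}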
30 changes: 13 additions & 17 deletions src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py
@@ -385,23 +385,19 @@ def test_llm_metrics_get_base_name(self) -> None:
             {
                 "timestamp": 1,
                 "response_timestamps": [3, 5, 8],
-                # FIXME - remove the whitespace once PA handles it.
-                # LLMProfileDataParser preprocesses the responses
-                # from triton server and removes first few chars.
-                # Add whitespace to avoid valid chars being removed.
                 "response_outputs": [
-                    {"text_output": " dogs"},
-                    {"text_output": " are"},
-                    {"text_output": " cool"},
+                    {"text_output": "dogs"},
+                    {"text_output": "are"},
+                    {"text_output": "cool"},
                 ],
             },
             {
                 "timestamp": 2,
                 "response_timestamps": [4, 7, 11],
                 "response_outputs": [
-                    {"text_output": " I"},
-                    {"text_output": " don't"},
-                    {"text_output": " cook food"},
+                    {"text_output": "I"},
+                    {"text_output": "don't"},
+                    {"text_output": "cook food"},
                 ],
             },
         ],
@@ -416,19 +412,19 @@
                 "timestamp": 5,
                 "response_timestamps": [7, 8, 13, 18],
                 "response_outputs": [
-                    {"text_output": " cats"},
-                    {"text_output": " are"},
-                    {"text_output": " cool"},
-                    {"text_output": " too"},
+                    {"text_output": "cats"},
+                    {"text_output": "are"},
+                    {"text_output": "cool"},
+                    {"text_output": "too"},
                 ],
             },
             {
                 "timestamp": 3,
                 "response_timestamps": [6, 8, 11],
                 "response_outputs": [
-                    {"text_output": " it's"},
-                    {"text_output": " very"},
-                    {"text_output": " simple work"},
+                    {"text_output": "it's"},
+                    {"text_output": "very"},
+                    {"text_output": "simple work"},
                 ],
             },
         ],
8 changes: 8 additions & 0 deletions src/c++/perf_analyzer/infer_context.cc
@@ -188,6 +188,14 @@ InferContext::GetOutput(const cb::InferResult& infer_result)
     const uint8_t* buf{nullptr};
     size_t byte_size{0};
     infer_result.RawData(requested_output->Name(), &buf, &byte_size);
+
+    // The first 4 bytes of BYTES data are a 32-bit integer indicating the
+    // size of the rest of the data (which we already know from byte_size).
+    // They should be ignored here, as they aren't part of the actual response.
+    if (requested_output->Datatype() == "BYTES") {
+      buf += 4;
+      byte_size -= 4;
+    }
     output.emplace(requested_output->Name(), ResponseData(buf, byte_size));
   }
   return output;
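This is the core fix. For reference, the framing it relies on: a BYTES element in the raw result buffer is a 4-byte size followed by the payload, so skipping the prefix yields exactly the bytes the model produced. A minimal sketch of the layout and its inverse (helper names are illustrative, not part of the PR; like the code above, it assumes a single element per buffer):

#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Frame one BYTES element the way it shows up in the raw result buffer:
// a 32-bit size prefix followed by the payload bytes.
std::vector<uint8_t> FrameBytesElement(const std::string& payload)
{
  std::vector<uint8_t> buf(sizeof(uint32_t) + payload.size());
  const uint32_t len = static_cast<uint32_t>(payload.size());
  std::memcpy(buf.data(), &len, sizeof(len));
  std::memcpy(buf.data() + sizeof(len), payload.data(), payload.size());
  return buf;
}

// Mirror of the fix above: drop the 4-byte prefix to recover the payload.
std::string UnframeBytesElement(const uint8_t* buf, size_t byte_size)
{
  return std::string(reinterpret_cast<const char*>(buf) + 4, byte_size - 4);
}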
7 changes: 4 additions & 3 deletions src/c++/perf_analyzer/infer_data_manager.cc
@@ -175,11 +175,12 @@ InferDataManager::InitInferDataInput(

 cb::Error
 InferDataManager::InitInferDataOutput(
-    const std::string& name, InferData& infer_data)
+    const std::string& name, const ModelTensor& model_tensor,
+    InferData& infer_data)
 {
   cb::InferRequestedOutput* requested_output;
-  RETURN_IF_ERROR(
-      cb::InferRequestedOutput::Create(&requested_output, backend_kind_, name));
+  RETURN_IF_ERROR(cb::InferRequestedOutput::Create(
+      &requested_output, backend_kind_, name, model_tensor.datatype_));
   infer_data.outputs_.push_back(requested_output);

   return cb::Error::Success;
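Here model_tensor is the parser's per-output metadata (the same object that arrives as output.second in the InitInferData loop further down); only its datatype string matters to this change. A rough sketch of the relevant shape, with everything beyond datatype_ an assumption:

#include <map>
#include <string>

// Assumed minimal shape of the parser's per-output metadata; the real
// ModelTensor carries more fields (shape, optionality, ...).
struct ModelTensor {
  std::string datatype_;  // e.g. "BYTES", "FP32"
};

// parser_->Outputs() yields pairs shaped like this: name -> tensor.
using OutputMap = std::map<std::string, ModelTensor>;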
3 changes: 2 additions & 1 deletion src/c++/perf_analyzer/infer_data_manager.h
@@ -74,7 +74,8 @@ class InferDataManager : public InferDataManagerBase {
       InferData& infer_data) override;

   cb::Error InitInferDataOutput(
-      const std::string& name, InferData& infer_data) override;
+      const std::string& name, const ModelTensor& model_tensor,
+      InferData& infer_data) override;

   /// Helper function to update the inputs
   /// \param thread_id The ID of the calling thread
3 changes: 2 additions & 1 deletion src/c++/perf_analyzer/infer_data_manager_base.cc
@@ -115,7 +115,8 @@ InferDataManagerBase::InitInferData(InferData& infer_data)
   }

   for (const auto& output : *(parser_->Outputs())) {
-    RETURN_IF_ERROR(InitInferDataOutput(output.first, infer_data));
+    RETURN_IF_ERROR(
+        InitInferDataOutput(output.first, output.second, infer_data));
   }

   return cb::Error::Success;
3 changes: 2 additions & 1 deletion src/c++/perf_analyzer/infer_data_manager_base.h
@@ -138,7 +138,8 @@ class InferDataManagerBase : public IInferDataManager {
       InferData& infer_data) = 0;

   virtual cb::Error InitInferDataOutput(
-      const std::string& name, InferData& infer_data) = 0;
+      const std::string& name, const ModelTensor& model_tensor,
+      InferData& infer_data) = 0;

   void AddInferDataParameters(InferData& infer_data);
