
Commit 77ae111

tgerdesnv authored and mc-nv committed
Properly remove first 4 bytes of BYTES data (#550)
* Add plumbing to get output datatype
* Ignore first 4 bytes of BYTES data for gathering output
* Update py code to no longer remove leading bytes
* Remove workaround in tests
* Fix order of args
* More fixes
* Handle case where bytes response is empty

1 parent 055c6b5 · commit 77ae111

19 files changed: +92 −78 lines
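For background: Triton serializes each element of a BYTES tensor as a 4-byte unsigned length prefix followed by the raw payload, so for a single-element response the first 4 bytes of the raw buffer are framing rather than output data. A minimal sketch of that framing, assuming native (typically little-endian) byte order; the helper name is illustrative and not part of this patch:

#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Illustrative helper: frame one BYTES element the way it appears on the
// wire -- a 4-byte length prefix followed by the raw payload.
std::vector<uint8_t> FrameBytesElement(const std::string& payload)
{
  std::vector<uint8_t> buf(4 + payload.size());
  const uint32_t len = static_cast<uint32_t>(payload.size());
  std::memcpy(buf.data(), &len, sizeof(len));  // framing, not response data
  std::memcpy(buf.data() + 4, payload.data(), payload.size());
  return buf;
}

This commit threads the output datatype through the client backends so that perf_analyzer can recognize BYTES outputs and strip that prefix in one place, instead of scrubbing "junk" characters downstream in Python.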

src/c++/library/common.cc (+6 −4)

@@ -279,9 +279,9 @@ InferInput::GetNext(
 Error
 InferRequestedOutput::Create(
     InferRequestedOutput** infer_output, const std::string& name,
-    const size_t class_count)
+    const size_t class_count, const std::string& datatype)
 {
-  *infer_output = new InferRequestedOutput(name, class_count);
+  *infer_output = new InferRequestedOutput(name, datatype, class_count);
   return Error::Success;
 }

@@ -309,8 +309,10 @@ InferRequestedOutput::UnsetSharedMemory()
 }

 InferRequestedOutput::InferRequestedOutput(
-    const std::string& name, const size_t class_count)
-    : name_(name), class_count_(class_count), io_type_(NONE)
+    const std::string& name, const std::string& datatype,
+    const size_t class_count)
+    : name_(name), datatype_(datatype), class_count_(class_count),
+      io_type_(NONE)
 {
 }

src/c++/library/common.h (+4 −2)

@@ -400,7 +400,7 @@ class InferRequestedOutput {
   /// \return Error object indicating success or failure.
   static Error Create(
       InferRequestedOutput** infer_output, const std::string& name,
-      const size_t class_count = 0);
+      const size_t class_count = 0, const std::string& datatype = "");

   /// Gets name of the associated output tensor.
   /// \return The name of the tensor.

@@ -455,9 +455,11 @@ class InferRequestedOutput {
 #endif

   explicit InferRequestedOutput(
-      const std::string& name, const size_t class_count = 0);
+      const std::string& name, const std::string& datatype,
+      const size_t class_count = 0);

   std::string name_;
+  std::string datatype_;
   size_t class_count_;

   // Used only if working with Shared Memory
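A caller-side sketch of the extended signature (the tensor name here is illustrative; only the `Create` signature itself comes from this diff, and error handling is elided):

// Requesting a BYTES output with the new optional datatype argument.
// class_count keeps its default of 0; the trailing parameter carries the
// output's datatype so downstream code can detect BYTES outputs.
tc::InferRequestedOutput* output = nullptr;
tc::Error err = tc::InferRequestedOutput::Create(
    &output, "text_output", 0 /* class_count */, "BYTES" /* datatype */);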

src/c++/perf_analyzer/client_backend/client_backend.cc (+9 −7)

@@ -519,16 +519,17 @@ InferInput::InferInput(
 Error
 InferRequestedOutput::Create(
     InferRequestedOutput** infer_output, const BackendKind kind,
-    const std::string& name, const size_t class_count)
+    const std::string& name, const std::string& datatype,
+    const size_t class_count)
 {
   if (kind == TRITON) {
     RETURN_IF_CB_ERROR(tritonremote::TritonInferRequestedOutput::Create(
-        infer_output, name, class_count));
+        infer_output, name, class_count, datatype));
   }
 #ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI
   else if (kind == OPENAI) {
-    RETURN_IF_CB_ERROR(
-        openai::OpenAiInferRequestedOutput::Create(infer_output, name));
+    RETURN_IF_CB_ERROR(openai::OpenAiInferRequestedOutput::Create(
+        infer_output, name, datatype));
   }
 #endif  // TRITON_ENABLE_PERF_ANALYZER_OPENAI
 #ifdef TRITON_ENABLE_PERF_ANALYZER_TFS

@@ -540,7 +541,7 @@ InferRequestedOutput::Create(
 #ifdef TRITON_ENABLE_PERF_ANALYZER_C_API
   else if (kind == TRITON_C_API) {
     RETURN_IF_CB_ERROR(tritoncapi::TritonCApiInferRequestedOutput::Create(
-        infer_output, name, class_count));
+        infer_output, name, class_count, datatype));
   }
 #endif  // TRITON_ENABLE_PERF_ANALYZER_C_API
   else {

@@ -564,8 +565,9 @@ InferRequestedOutput::SetSharedMemory(
 }

 InferRequestedOutput::InferRequestedOutput(
-    const BackendKind kind, const std::string& name)
-    : kind_(kind), name_(name)
+    const BackendKind kind, const std::string& name,
+    const std::string& datatype)
+    : kind_(kind), name_(name), datatype_(datatype)
 {
 }

src/c++/perf_analyzer/client_backend/client_backend.h (+11 −2)

@@ -581,18 +581,24 @@ class InferRequestedOutput {
   /// \param infer_output Returns a new InferOutputGrpc object.
   /// \param kind The kind of the associated client backend.
   /// \param name The name of output being requested.
+  /// \param datatype The datatype of the output
   /// \param class_count The number of classifications to be requested. The
   /// default value is 0 which means the classification results are not
   /// requested.
   /// \return Error object indicating success or failure.
   static Error Create(
       InferRequestedOutput** infer_output, const BackendKind kind,
-      const std::string& name, const size_t class_count = 0);
+      const std::string& name, const std::string& datatype,
+      const size_t class_count = 0);

   /// Gets name of the associated output tensor.
   /// \return The name of the tensor.
   const std::string& Name() const { return name_; }

+  /// Gets datatype of the associated output tensor.
+  /// \return The datatype of the tensor
+  const std::string& Datatype() const { return datatype_; }
+
   /// Set the output tensor data to be written to specified shared
   /// memory region.
   /// \param region_name The name of the shared memory region.

@@ -605,9 +611,12 @@ class InferRequestedOutput {
       const size_t offset = 0);

  protected:
-  InferRequestedOutput(const BackendKind kind, const std::string& name);
+  InferRequestedOutput(
+      const BackendKind kind, const std::string& name,
+      const std::string& datatype = "");
   const BackendKind kind_;
   const std::string name_;
+  const std::string datatype_;
 };

 //

src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc (+8 −6)

@@ -83,23 +83,25 @@ OpenAiClientBackend::ClientInferStat(InferStat* infer_stat)

 Error
 OpenAiInferRequestedOutput::Create(
-    InferRequestedOutput** infer_output, const std::string& name)
+    InferRequestedOutput** infer_output, const std::string& name,
+    const std::string& datatype)
 {
   OpenAiInferRequestedOutput* local_infer_output =
-      new OpenAiInferRequestedOutput(name);
+      new OpenAiInferRequestedOutput(name, datatype);

   tc::InferRequestedOutput* openai_infer_output;
-  RETURN_IF_TRITON_ERROR(
-      tc::InferRequestedOutput::Create(&openai_infer_output, name));
+  RETURN_IF_TRITON_ERROR(tc::InferRequestedOutput::Create(
+      &openai_infer_output, name, 0, datatype));
   local_infer_output->output_.reset(openai_infer_output);

   *infer_output = local_infer_output;

   return Error::Success;
 }

-OpenAiInferRequestedOutput::OpenAiInferRequestedOutput(const std::string& name)
-    : InferRequestedOutput(BackendKind::OPENAI, name)
+OpenAiInferRequestedOutput::OpenAiInferRequestedOutput(
+    const std::string& name, const std::string& datatype)
+    : InferRequestedOutput(BackendKind::OPENAI, name, datatype)
 {
 }

src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h (+4 −2)

@@ -95,13 +95,15 @@ class OpenAiClientBackend : public ClientBackend {
 class OpenAiInferRequestedOutput : public InferRequestedOutput {
  public:
   static Error Create(
-      InferRequestedOutput** infer_output, const std::string& name);
+      InferRequestedOutput** infer_output, const std::string& name,
+      const std::string& datatype);
   /// Returns the raw InferRequestedOutput object required by OpenAi client
   /// library.
   tc::InferRequestedOutput* Get() const { return output_.get(); }

  private:
-  explicit OpenAiInferRequestedOutput(const std::string& name);
+  explicit OpenAiInferRequestedOutput(
+      const std::string& name, const std::string& datatype);

   std::unique_ptr<tc::InferRequestedOutput> output_;
 };

src/c++/perf_analyzer/client_backend/triton/triton_client_backend.cc (+6 −5)

@@ -768,14 +768,14 @@ TritonInferInput::TritonInferInput(
 Error
 TritonInferRequestedOutput::Create(
     InferRequestedOutput** infer_output, const std::string& name,
-    const size_t class_count)
+    const size_t class_count, const std::string& datatype)
 {
   TritonInferRequestedOutput* local_infer_output =
-      new TritonInferRequestedOutput(name);
+      new TritonInferRequestedOutput(name, datatype);

   tc::InferRequestedOutput* triton_infer_output;
   RETURN_IF_TRITON_ERROR(tc::InferRequestedOutput::Create(
-      &triton_infer_output, name, class_count));
+      &triton_infer_output, name, class_count, datatype));
   local_infer_output->output_.reset(triton_infer_output);

   *infer_output = local_infer_output;

@@ -793,8 +793,9 @@ TritonInferRequestedOutput::SetSharedMemory(
 }


-TritonInferRequestedOutput::TritonInferRequestedOutput(const std::string& name)
-    : InferRequestedOutput(BackendKind::TRITON, name)
+TritonInferRequestedOutput::TritonInferRequestedOutput(
+    const std::string& name, const std::string& datatype)
+    : InferRequestedOutput(BackendKind::TRITON, name, datatype)
 {
 }

src/c++/perf_analyzer/client_backend/triton/triton_client_backend.h (+3 −2)

@@ -299,7 +299,7 @@ class TritonInferRequestedOutput : public InferRequestedOutput {
  public:
   static Error Create(
       InferRequestedOutput** infer_output, const std::string& name,
-      const size_t class_count = 0);
+      const size_t class_count = 0, const std::string& datatype = "");
   /// Returns the raw InferRequestedOutput object required by triton client
   /// library.
   tc::InferRequestedOutput* Get() const { return output_.get(); }

@@ -309,7 +309,8 @@ class TritonInferRequestedOutput : public InferRequestedOutput {
       const size_t offset = 0) override;

  private:
-  explicit TritonInferRequestedOutput(const std::string& name);
+  explicit TritonInferRequestedOutput(
+      const std::string& name, const std::string& datatype);

   std::unique_ptr<tc::InferRequestedOutput> output_;
 };

src/c++/perf_analyzer/client_backend/triton_c_api/triton_c_api_backend.cc (+2 −2)

@@ -335,14 +335,14 @@ TritonCApiInferInput::TritonCApiInferInput(
 Error
 TritonCApiInferRequestedOutput::Create(
     InferRequestedOutput** infer_output, const std::string& name,
-    const size_t class_count)
+    const size_t class_count, const std::string& datatype)
 {
   TritonCApiInferRequestedOutput* local_infer_output =
       new TritonCApiInferRequestedOutput(name);

   tc::InferRequestedOutput* triton_infer_output;
   RETURN_IF_TRITON_ERROR(tc::InferRequestedOutput::Create(
-      &triton_infer_output, name, class_count));
+      &triton_infer_output, name, class_count, datatype));
   local_infer_output->output_.reset(triton_infer_output);

   *infer_output = local_infer_output;

src/c++/perf_analyzer/client_backend/triton_c_api/triton_c_api_backend.h (+1 −1)

@@ -186,7 +186,7 @@ class TritonCApiInferRequestedOutput : public InferRequestedOutput {
  public:
   static Error Create(
       InferRequestedOutput** infer_output, const std::string& name,
-      const size_t class_count = 0);
+      const size_t class_count = 0, const std::string& datatype = "");
   /// Returns the raw InferRequestedOutput object required by triton client
   /// library.
   tc::InferRequestedOutput* Get() const { return output_.get(); }

src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py (+1 −18)

@@ -460,28 +460,11 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
             num_generated_tokens,
         )

-    def _remove_leading_invalid_chars(self, text: str):
-        if len(text) < 4:
-            return text
-
-        for i, char in enumerate(text):
-            # There will be 3 or 4 chars
-            # (but sometimes the first char looks valid, so don't stop until we've seen at least 3)
-            if char.isprintable() and i > 2:
-                break
-
-        return text[i:]
-
     def _preprocess_response(
         self, res_timestamps: list[int], res_outputs: list[dict[str, str]]
     ) -> None:
         """Helper function to preprocess responses of a request."""
-        # FIXME -- remove this triton code once it is properly fixed in PA
-        # (PA/triton will add junk to the start of the BYTES array. Remove it here)
-        if self._service_kind == "triton":
-            for d in res_outputs:
-                d["text_output"] = self._remove_leading_invalid_chars(d["text_output"])
-        elif self._service_kind == "openai":
+        if self._service_kind == "openai":
             # remove the null final response in streaming mode
             last_response = res_outputs[-1]["response"]
             last_response = remove_sse_prefix(last_response)

src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py (+13 −17)

@@ -385,23 +385,19 @@ def test_llm_metrics_get_base_name(self) -> None:
            {
                "timestamp": 1,
                "response_timestamps": [3, 5, 8],
-               # FIXME - remove the whitespace once PA handles it.
-               # LLMProfileDataParser preprocessse the responses
-               # from triton server and removes first few chars.
-               # Add whitespace to avoid valid chars being removed.
                "response_outputs": [
-                   {"text_output": " dogs"},
-                   {"text_output": " are"},
-                   {"text_output": " cool"},
+                   {"text_output": "dogs"},
+                   {"text_output": "are"},
+                   {"text_output": "cool"},
                ],
            },
            {
                "timestamp": 2,
                "response_timestamps": [4, 7, 11],
                "response_outputs": [
-                   {"text_output": " I"},
-                   {"text_output": " don't"},
-                   {"text_output": " cook food"},
+                   {"text_output": "I"},
+                   {"text_output": "don't"},
+                   {"text_output": "cook food"},
                ],
            },
        ],

@@ -416,19 +412,19 @@ def test_llm_metrics_get_base_name(self) -> None:
                "timestamp": 5,
                "response_timestamps": [7, 8, 13, 18],
                "response_outputs": [
-                   {"text_output": " cats"},
-                   {"text_output": " are"},
-                   {"text_output": " cool"},
-                   {"text_output": " too"},
+                   {"text_output": "cats"},
+                   {"text_output": "are"},
+                   {"text_output": "cool"},
+                   {"text_output": "too"},
                ],
            },
            {
                "timestamp": 3,
                "response_timestamps": [6, 8, 11],
                "response_outputs": [
-                   {"text_output": " it's"},
-                   {"text_output": " very"},
-                   {"text_output": " simple work"},
+                   {"text_output": "it's"},
+                   {"text_output": "very"},
+                   {"text_output": "simple work"},
                ],
            },
        ],

src/c++/perf_analyzer/infer_context.cc (+8)

@@ -188,6 +188,14 @@ InferContext::GetOutput(const cb::InferResult& infer_result)
     const uint8_t* buf{nullptr};
     size_t byte_size{0};
     infer_result.RawData(requested_output->Name(), &buf, &byte_size);
+
+    // The first 4 bytes of BYTES data is a 32-bit integer to indicate the size
+    // of the rest of the data (which we already know based on byte_size). It
+    // should be ignored here, as it isn't part of the actual response
+    if (requested_output->Datatype() == "BYTES" && byte_size >= 4) {
+      buf += 4;
+      byte_size -= 4;
+    }
     output.emplace(requested_output->Name(), ResponseData(buf, byte_size));
   }
   return output;
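A worked example of the new stripping logic, assuming a single-element BYTES response with a little-endian length prefix (values illustrative, not code from this commit):

#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
  // Wire form of the BYTES element "dogs": 4-byte length prefix, then payload.
  const uint8_t wire[] = {0x04, 0x00, 0x00, 0x00, 'd', 'o', 'g', 's'};
  const uint8_t* buf = wire;
  size_t byte_size = sizeof(wire);  // 8 bytes on the wire

  // Same logic as the hunk above; the byte_size >= 4 guard covers the
  // "bytes response is empty" case called out in the commit message.
  if (byte_size >= 4) {
    buf += 4;
    byte_size -= 4;
  }

  assert(byte_size == 4);                    // only the payload remains
  assert(std::memcmp(buf, "dogs", 4) == 0);  // "dogs", with no junk prefix
  return 0;
}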

src/c++/perf_analyzer/infer_data_manager.cc (+4 −3)

@@ -175,11 +175,12 @@ InferDataManager::InitInferDataInput(

 cb::Error
 InferDataManager::InitInferDataOutput(
-    const std::string& name, InferData& infer_data)
+    const std::string& name, const ModelTensor& model_tensor,
+    InferData& infer_data)
 {
   cb::InferRequestedOutput* requested_output;
-  RETURN_IF_ERROR(
-      cb::InferRequestedOutput::Create(&requested_output, backend_kind_, name));
+  RETURN_IF_ERROR(cb::InferRequestedOutput::Create(
+      &requested_output, backend_kind_, name, model_tensor.datatype_));
   infer_data.outputs_.push_back(requested_output);

   return cb::Error::Success;

src/c++/perf_analyzer/infer_data_manager.h (+2 −1)

@@ -74,7 +74,8 @@ class InferDataManager : public InferDataManagerBase {
       InferData& infer_data) override;

   cb::Error InitInferDataOutput(
-      const std::string& name, InferData& infer_data) override;
+      const std::string& name, const ModelTensor& model_tensor,
+      InferData& infer_data) override;

   /// Helper function to update the inputs
   /// \param thread_id The ID of the calling thread

src/c++/perf_analyzer/infer_data_manager_base.cc (+2 −1)

@@ -115,7 +115,8 @@ InferDataManagerBase::InitInferData(InferData& infer_data)
   }

   for (const auto& output : *(parser_->Outputs())) {
-    RETURN_IF_ERROR(InitInferDataOutput(output.first, infer_data));
+    RETURN_IF_ERROR(
+        InitInferDataOutput(output.first, output.second, infer_data));
   }

   return cb::Error::Success;
