Allow extra request inputs #552

Merged: 15 commits, Apr 3, 2024
12 changes: 12 additions & 0 deletions src/c++/perf_analyzer/genai-perf/README.md
@@ -131,6 +131,11 @@ options:
* `--num-prompts`: The number of unique prompts to generate.
* `--dataset`: HuggingFace dataset to use for benchmarking.

You can optionally set additional model inputs with the following option:
* `--extra-inputs {input_name}:{value}`: An additional input to pass to the model, with a singular value,
such as `stream:true` or `max_tokens:5`. This flag can be repeated to supply multiple extra inputs.
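
For example, a hypothetical invocation such as `genai-perf --extra-inputs stream:true --extra-inputs max_tokens:5 ...` (other required options elided) would add both fields to every request payload.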


# Metrics

GenAI-Perf collects a diverse set of metrics that captures the performance of
@@ -241,6 +246,13 @@ both infer per second and latency.

Enables the use of the streaming API.

##### `--extra-inputs`

Provides an additional input to pass to the model, with a singular value,
such as `stream:true` or `max_tokens:5`. This flag can be repeated to supply multiple extra inputs.



##### `--endpoint {v1/completions,v1/chat/completions}`

Describes what endpoint to send requests to on the server. This is required when
src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
@@ -86,6 +86,7 @@ def create_llm_inputs(
add_model_name: bool = False,
add_stream: bool = False,
tokenizer: AutoTokenizer = DEFAULT_TOKENIZER,
extra_inputs: Dict = {},
) -> Dict:
"""
Given an input type, input format, and output type, output a string of LLM Inputs
@@ -109,9 +110,11 @@
length:
Number of entries to gather
add_model_name:
If true adds a model name field to each payload
If true, adds a model name field to each payload
add_stream:
If true adds a steam field to each payload
If true, adds a stream field to each payload
extra_inputs:
If provided, appends these inputs to every request

Required Synthetic Prompt Generation Parameters
-----------------------------------------------
@@ -164,7 +167,12 @@
)

json_in_pa_format = LlmInputs._convert_generic_json_to_output_format(
output_format, generic_dataset_json, add_model_name, add_stream, model_name
output_format,
generic_dataset_json,
add_model_name,
add_stream,
model_name,
extra_inputs,
)
LlmInputs._write_json_to_file(json_in_pa_format)

@@ -309,24 +317,29 @@ def _convert_generic_json_to_output_format(
add_model_name: bool,
add_stream: bool,
model_name: str = "",
extra_inputs: Dict = {},
) -> Dict:
if output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS:
output_json = (
LlmInputs._convert_generic_json_to_openai_chat_completions_format(
generic_dataset, add_model_name, add_stream, model_name
generic_dataset,
add_model_name,
add_stream,
model_name,
extra_inputs,
)
)
elif output_format == OutputFormat.OPENAI_COMPLETIONS:
output_json = LlmInputs._convert_generic_json_to_openai_completions_format(
generic_dataset, add_model_name, add_stream, model_name
generic_dataset, add_model_name, add_stream, model_name, extra_inputs
)
elif output_format == OutputFormat.VLLM:
output_json = LlmInputs._convert_generic_json_to_vllm_format(
generic_dataset, add_model_name, add_stream, model_name
generic_dataset, add_model_name, add_stream, model_name, extra_inputs
)
elif output_format == OutputFormat.TRTLLM:
output_json = LlmInputs._convert_generic_json_to_trtllm_format(
generic_dataset, add_model_name, add_stream, model_name
generic_dataset, add_model_name, add_stream, model_name, extra_inputs
)
else:
raise GenAIPerfException(
@@ -342,6 +355,7 @@ def _convert_generic_json_to_openai_chat_completions_format(
add_model_name: bool,
add_stream: bool,
model_name: str = "",
extra_inputs: Dict = {},
) -> Dict:
# TODO (TMA-1757): Implement a way to select a role for `text_input`
(
@@ -356,6 +370,7 @@ def _convert_generic_json_to_openai_chat_completions_format(
add_model_name,
add_stream,
model_name,
extra_inputs,
)

return pa_json
@@ -367,6 +382,7 @@ def _convert_generic_json_to_openai_completions_format(
add_model_name: bool,
add_stream: bool,
model_name: str = "",
extra_inputs: Dict = {},
) -> Dict:
(
system_role_headers,
@@ -381,6 +397,7 @@ def _convert_generic_json_to_openai_completions_format(
add_model_name,
add_stream,
model_name,
extra_inputs,
)

return pa_json
@@ -392,6 +409,7 @@ def _convert_generic_json_to_vllm_format(
add_model_name: bool,
add_stream: bool,
model_name: str = "",
extra_inputs: Dict = {},
) -> Dict:
(
system_role_headers,
@@ -407,6 +425,7 @@ def _convert_generic_json_to_vllm_format(
add_model_name,
add_stream,
model_name,
extra_inputs,
)

return pa_json
@@ -418,6 +437,7 @@ def _convert_generic_json_to_trtllm_format(
add_model_name: bool,
add_stream: bool,
model_name: str = "",
extra_inputs: Dict = {},
) -> Dict:
(
system_role_headers,
@@ -433,6 +453,7 @@ def _convert_generic_json_to_trtllm_format(
add_model_name,
add_stream,
model_name,
extra_inputs,
)

return pa_json
@@ -480,6 +501,7 @@ def _populate_openai_chat_completions_output_json(
add_model_name: bool,
add_stream: bool,
model_name: str = "",
extra_inputs: Dict = {},
) -> Dict:
pa_json = LlmInputs._create_empty_openai_pa_json()

@@ -497,7 +519,7 @@
)

pa_json = LlmInputs._add_optional_tags_to_openai_json(
pa_json, index, add_model_name, add_stream, model_name
pa_json, index, add_model_name, add_stream, model_name, extra_inputs
)

return pa_json
@@ -512,6 +534,7 @@ def _populate_openai_completions_output_json(
add_model_name: bool,
add_stream: bool,
model_name: str = "",
extra_inputs: Dict = {},
) -> Dict:
pa_json = LlmInputs._create_empty_openai_pa_json()

@@ -531,7 +554,7 @@
pa_json = LlmInputs._add_new_prompt_to_json(pa_json, index, new_prompt)

pa_json = LlmInputs._add_optional_tags_to_openai_json(
pa_json, index, add_model_name, add_stream, model_name
pa_json, index, add_model_name, add_stream, model_name, extra_inputs
)

return pa_json
@@ -546,6 +569,7 @@ def _populate_vllm_output_json(
add_model_name: bool,
add_stream: bool,
model_name: str = "",
extra_inputs: Dict = {},
) -> Dict:
pa_json = LlmInputs._create_empty_vllm_pa_json()

@@ -566,7 +590,7 @@
)

pa_json = LlmInputs._add_optional_tags_to_vllm_json(
pa_json, index, add_model_name, add_stream, model_name
pa_json, index, add_model_name, add_stream, model_name, extra_inputs
)

return pa_json
@@ -581,6 +605,7 @@ def _populate_trtllm_output_json(
add_model_name: bool,
add_stream: bool,
model_name: str = "",
extra_inputs: Dict = {},
) -> Dict:
pa_json = LlmInputs._create_empty_trtllm_pa_json()

@@ -602,7 +627,7 @@

pa_json = LlmInputs._add_required_tags_to_trtllm_json(pa_json, index)
pa_json = LlmInputs._add_optional_tags_to_trtllm_json(
pa_json, index, add_model_name, add_stream, model_name
pa_json, index, add_model_name, add_stream, model_name, extra_inputs
)

return pa_json
@@ -737,11 +762,14 @@ def _add_optional_tags_to_openai_json(
add_model_name: bool,
add_stream: bool,
model_name: str = "",
extra_inputs: Dict = {},
) -> Dict:
if add_model_name:
pa_json["data"][index]["payload"][0]["model"] = model_name
if add_stream:
pa_json["data"][index]["payload"][0]["stream"] = True
for key, value in extra_inputs.items():
pa_json["data"][index]["payload"][0][key] = value

return pa_json

@@ -753,11 +781,14 @@ def _add_optional_tags_to_vllm_json(
add_model_name: bool,
add_stream: bool,
model_name: str = "",
extra_inputs: Dict = {},
) -> Dict:
if add_model_name:
pa_json["data"][index]["model"] = model_name
if add_stream:
pa_json["data"][index]["stream"] = [True]
for key, value in extra_inputs.items():
pa_json["data"][index][key] = [value]

return pa_json

@@ -769,11 +800,14 @@ def _add_optional_tags_to_trtllm_json(
add_model_name: bool,
add_stream: bool,
model_name: str = "",
extra_inputs: Dict = {},
) -> Dict:
if add_model_name:
pa_json["data"][index]["model"] = model_name
if add_stream:
pa_json["data"][index]["stream"] = [True]
for key, value in extra_inputs.items():
pa_json["data"][index][key] = [value]

return pa_json
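
The three `_add_optional_tags_to_*` helpers above attach extra inputs in two different shapes: the OpenAI variant writes each value directly onto the first payload object, while the vLLM and TRT-LLM variants wrap each value in a single-element list. A minimal sketch of just that difference, using hypothetical stand-in dicts rather than the real generated `pa_json` structures:

```python
# Hypothetical stand-ins for the generated pa_json structures.
extra_inputs = {"max_tokens": 5, "stream": True}

# OpenAI-style: extra inputs are set directly on the payload object.
openai_pa_json = {"data": [{"payload": [{"messages": []}]}]}
for key, value in extra_inputs.items():
    openai_pa_json["data"][0]["payload"][0][key] = value

# vLLM / TRT-LLM-style: each value is wrapped in a single-element list.
trtllm_pa_json = {"data": [{"text_input": ["hello"]}]}
for key, value in extra_inputs.items():
    trtllm_pa_json["data"][0][key] = [value]

print(openai_pa_json["data"][0]["payload"][0]["max_tokens"])  # 5
print(trtllm_pa_json["data"][0]["max_tokens"])                # [5]
```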

6 changes: 6 additions & 0 deletions src/c++/perf_analyzer/genai-perf/genai_perf/main.py
@@ -45,6 +45,11 @@ def generate_inputs(args: ArgumentParser, tokenizer: AutoTokenizer) -> None:
input_file_name = ""
# TODO (TMA-1759): review if add_model_name is always true
add_model_name = True
try:
extra_input_dict = parser.get_extra_inputs_as_dict(args)
except ValueError as e:
raise GenAIPerfException(e)

LlmInputs.create_llm_inputs(
input_type=args.prompt_source,
output_format=args.output_format,
@@ -61,6 +66,7 @@ def generate_inputs(args: ArgumentParser, tokenizer: AutoTokenizer) -> None:
add_model_name=add_model_name,
add_stream=args.streaming,
tokenizer=tokenizer,
extra_inputs=extra_input_dict,
)


55 changes: 50 additions & 5 deletions src/c++/perf_analyzer/genai-perf/genai_perf/parser.py
@@ -55,8 +55,8 @@ def _check_conditional_args(
elif args.endpoint == "v1/completions":
args.output_format = OutputFormat.OPENAI_COMPLETIONS
elif args.endpoint is not None:
logger.warning(
"The --endpoint option is ignored when not using the 'openai' service-kind."
parser.error(
"The --endpoint option should only be used when using the 'openai' service-kind."
)
if args.service_kind == "triton":
args = _convert_str_to_enum_entry(args, "backend", OutputFormat)
@@ -113,6 +113,13 @@ def handler(args, extra_args):
def _add_input_args(parser):
input_group = parser.add_argument_group("Input")

input_group.add_argument(
"--extra-inputs",
action="append",
help="Provide additional inputs to include with every request. "
"You can repeat this flag for multiple inputs. Inputs should be in a input_name:value format.",
)

input_group.add_argument(
"--input-dataset",
type=str.lower,
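
Since `--extra-inputs` is registered with `action="append"`, each repetition of the flag appends one more `input_name:value` string to a list on the parsed namespace, and the attribute is left as `None` when the flag is never passed. A small sketch of that argparse behavior, using a throwaway parser rather than the real genai-perf one:

```python
import argparse

# Throwaway parser mirroring only the flag registration above.
p = argparse.ArgumentParser()
p.add_argument("--extra-inputs", action="append")

args = p.parse_args(["--extra-inputs", "stream:true", "--extra-inputs", "max_tokens:5"])
print(args.extra_inputs)  # ['stream:true', 'max_tokens:5']

# With no occurrences the attribute defaults to None, so downstream code must guard.
print(p.parse_args([]).extra_inputs)  # None
```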
@@ -242,9 +249,8 @@ def _add_endpoint_args(parser):
type=str,
choices=["v1/chat/completions", "v1/completions"],
required=False,
help="The endpoint to send requests to on the "
'server. This is required when using the "openai" service-kind. '
"This is ignored in other cases.",
help=f"The endpoint to send requests to on the "
'server. This is only used with the "openai" service-kind. ',
)

endpoint_group.add_argument(
@@ -318,6 +324,45 @@ def _add_other_args(parser):
)


def get_extra_inputs_as_dict(args: argparse.Namespace) -> dict:
request_inputs = {}
if hasattr(args, "extra_inputs") and args.extra_inputs:
for input_str in args.extra_inputs:
try:
input_name, value = input_str.split(":", 1)
if not input_name or not value:
raise ValueError("Input_name or value is empty")
except ValueError:
raise ValueError(
f"Invalid input format for --extra-inputs: {input_str}\n"
"Expected input format: 'input_name:value'"
)

# Detect whether the value looks like a bool, int, or float
is_bool = value.lower() in ["true", "false"]
is_int = value.isdigit()
is_float = value.count(".") == 1 and (
value[0] == "." or value.replace(".", "").isdigit()
)

# Convert value to bool, int, or float if applicable
if is_bool:
value = value.lower() == "true"
elif is_int:
value = int(value)
elif is_float:
value = float(value)

if input_name in request_inputs:
raise ValueError(
f"Input name already exists in request_inputs dictionary: {input_name}"
)
request_inputs[input_name] = value

return request_inputs
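
A usage sketch for the function above (it assumes `get_extra_inputs_as_dict` is in scope, and the namespace is hand-built purely for illustration): booleans, integers, and floats are coerced to typed values, and anything else stays a string:

```python
import argparse

# Hand-built namespace standing in for real parse_args() output.
ns = argparse.Namespace(
    extra_inputs=["stream:true", "max_tokens:5", "temperature:0.7", "stop:###"]
)
print(get_extra_inputs_as_dict(ns))
# {'stream': True, 'max_tokens': 5, 'temperature': 0.7, 'stop': '###'}
```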


### Entrypoint ###


1 change: 1 addition & 0 deletions src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
@@ -57,6 +57,7 @@ def build_cmd(args, extra_args):
"input_format",
"model",
"backend",
"extra_inputs",
"output_format",
# The 'streaming' passed in to this script is to determine if the
# LLM response should be streaming. That is different than the